Date: (Mon) May 11, 2015
Data sources: Training: https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTrain.csv New: https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTest.csv
Time period:
Based on analysis utilizing <> techniques,
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until custom dummy classifer is implemented
# Environment setup: clear the workspace, fix the RNG seed for
# reproducibility, and load the author's helper-function libraries.
# NOTE(review): rm(list=ls()) and ~-anchored source() paths tie this
# script to one machine; consider a project-relative layout.
rm(list=ls())
set.seed(12345)
# keep character columns as chr (factors were the default pre-R-4.0)
options(stringsAsFactors=FALSE)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
# Gather all package requirements here
#suppressPackageStartupMessages(require())
#packageVersion("snow")
#require(sos); findFn("pinv", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Data locations (Kaggle NYTimes blog competition)
glb_trnng_url <- "https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTrain.csv"
glb_newdt_url <- "https://kaggle2.blob.core.windows.net/competitions-data/kaggle/4347/NYTimesBlogTest.csv"
# Prefix for all output artifacts of this run
glb_out_pfx <- "NYTBlogs_myEducation_"
glb_is_separate_newent_dataset <- TRUE # or FALSE
glb_split_entity_newent_datasets <- TRUE # or FALSE
# The next four settings only matter when the "new" data is split off
# the training data (glb_is_separate_newent_dataset == FALSE)
glb_split_newdata_method <- "sample" # "condition" or "sample" or "copy"
glb_split_newdata_condition <- "<col_name> <condition_operator> <value>" # or NULL
glb_split_newdata_size_ratio <- 0.3 # > 0 & < 1
glb_split_sample.seed <- 123 # or any integer
glb_drop_vars <- c(NULL) # or c("<col_name>")
#glb_max_fitent_obs <- 2238 # NULL # or any integer
glb_max_fitent_obs <- NULL # or any integer
# Problem-type flags: binary classification in this run
glb_is_regression <- FALSE; glb_is_classification <- TRUE; glb_is_binomial <- TRUE
glb_rsp_var_raw <- "Popular"
# for classification, the response variable has to be a factor
glb_rsp_var <- "Popular.fctr"
# if the response factor is based on numbers e.g (0/1 vs. "A"/"B"),
# caret predict(..., type="prob") crashes
glb_map_rsp_raw_to_var <- function(raw) {
    # Map the raw 0/1 response to a two-level factor ("N"/"Y") with "N"
    # as the reference level (caret predict(type="prob") needs
    # non-numeric factor labels; see note above).
    # Bug fix: the original also passed as.factor(c("Y", "N")) as a
    # second positional argument to relevel(); since ref= was given by
    # name, that argument fell into `...` and was silently ignored.
    # It is dropped here -- behavior is unchanged.
    relevel(factor(ifelse(raw == 1, "Y", "N")), ref="N")
    #as.factor(paste0("B", raw))
    #as.factor(raw)
}
# Sanity check: 1 -> "Y", 0 -> "N"; expected levels c("N", "Y")
glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0))
## [1] Y Y N N N
## Levels: N Y
glb_map_rsp_var_to_raw <- function(var) {
    # Invert the response mapping: factor("N"/"Y", ref="N") -> 0/1.
    # The integer codes are 1-based (reference level "N" == 1),
    # so shifting by one recovers the raw 0/1 encoding.
    codes <- as.numeric(var)
    codes - 1
}
# Round-trip sanity check: var->raw inverts raw->var
glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0)))
## [1] 1 1 0 0 0
# Guard: a raw->factor mapper is mandatory whenever the modeled response
# differs from the raw response column.
# Idiom fix: scalar condition uses short-circuiting && instead of
# element-wise & (same result here, but && is correct for `if`).
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var))
    stop("glb_map_rsp_raw_to_var function expected")
glb_rsp_var_out <- paste0(glb_rsp_var, ".predict.") # model_id is appended later
# List info gathered for various columns
# <col_name>: <description>; <notes>
# NewsDesk = the New York Times desk that produced the story
# SectionName = the section the article appeared in (Opinion, Arts, Technology, etc.)
# SubsectionName = the subsection the article appeared in (Education, Small Business, Room for Debate, etc.)
# Headline = the title of the article
# Snippet = a small portion of the article text
# Abstract = a summary of the blog article, written by the New York Times
# WordCount = the number of words in the article
# created WordCount.log
# PubDate = the publication date, in the format "Year-Month-Day Hour:Minute:Second"
glb_date_vars <- c("PubDate")
# UniqueID = a unique identifier for each article
glb_id_vars <- c("UniqueID")
glb_is_textual <- TRUE # vs. glb_is_numerical ???
#Sys.setlocale("LC_ALL", "C") # For english
glb_txt_vars <- c("Headline", "Snippet", "Abstract")
glb_append_stop_words <- list() # NULL # or c("<freq_word>")
# Properties:
# numrows(glb_feats_df) << numrows(glb_fitent_df)
# Select terms that appear in at least 0.2 * O(FP/FN(glb_OOBent_df))
# numrows(glb_OOBent_df) = 1.1 * numrows(glb_newent_df)
# NOTE(review): presumably one sparsity threshold per text var for
# term-matrix pruning (e.g. tm::removeSparseTerms) -- TODO confirm
#glb_sprs_thresholds <- c(0.982, 0.965, 0.965)
glb_sprs_thresholds <- c(0.982, 0.970, 0.970)
names(glb_sprs_thresholds) <- glb_txt_vars
# List transformed vars
glb_exclude_vars_as_features <- c(NULL) # or c("<var_name>")
# raw text columns are replaced by derived term features downstream
if (glb_is_textual)
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
glb_txt_vars)
if (glb_rsp_var_raw != glb_rsp_var)
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
glb_rsp_var_raw)
# List feats that shd be excluded due to known causation by prediction variable
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
c(NULL)) # or c("<col_name>")
# List output vars (useful during testing in console)
# glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
# grep(glb_rsp_var_out, names(glb_trnent_df), value=TRUE))
glb_impute_na_data <- TRUE # or FALSE
glb_mice_complete.seed <- 144 # or any integer
# Model bookkeeping: list of fitted models and a summary data frame
glb_models_lst <- list(); glb_models_df <- data.frame()
# rpart: .rnorm messes with the models badly
# caret creates dummy vars for factor feats which messes up the tuning
# - better to feed as.numeric(<feat>.fctr) to caret
# Modeling methods to try, chosen by problem type
# Regression
if (glb_is_regression)
glb_models_method_vctr <- c("lm", "glm", "rpart", "rf") else
# Classification
if (glb_is_binomial)
glb_models_method_vctr <- c("glm", "rpart", "rf") else
glb_models_method_vctr <- c("rpart", "rf")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<col_name>")
# Optional cost matrix for a custom evaluation metric
glb_model_metric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glb_model_metric <- NULL # or "<metric_name>"
glb_model_metric_maximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glb_model_metric_smmry <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glb_model_metric_terms)
# metric <- sum(confusion_mtrx * glb_model_metric_terms) / nrow(data)
# names(metric) <- glb_model_metric
# return(metric)
# }
# Hyper-parameter tuning grid; "dummy" row presumably a placeholder
# meaning no real tuning -- TODO confirm against mymodels helpers
glb_tune_models_df <-
rbind(
#data.frame(parameter="cp", min=0.00005, max=0.00005, by=0.000005),
#seq(from=0.01, to=0.01, by=0.01)
#data.frame(parameter="mtry", min=2, max=4, by=1),
data.frame(parameter="dummy", min=2, max=4, by=1)
)
# or NULL
glb_n_cv_folds <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
if (glb_is_regression)
glb_model_evl_criteria <- c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit")
if (glb_is_classification) {
if (glb_is_binomial)
glb_model_evl_criteria <-
c("max.Accuracy.OOB", "max.auc.OOB", "max.Kappa.OOB", "min.aic.fit") else
glb_model_evl_criteria <- c("max.Accuracy.OOB", "max.Kappa.OOB")
}
glb_sel_mdl_id <- NULL # or "<model_id_prefix>.<model_method>"
glb_fin_mdl_id <- glb_sel_mdl_id # or "Final"
# Depict process
# Petri-net picture of the workflow: trans_df rows are the artifacts
# produced, places_df rows are workflow stages (M0 = initial token
# marking), arcs_df wires artifacts to stages; x/y are plot coordinates.
glb_analytics_pn <- petrinet(name="glb_analytics_pn",
trans_df=data.frame(id=1:6,
name=c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df=data.frame(
begin=c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end =c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
# Run-state trackers: available-artifact list and per-chunk timing log
glb_analytics_avl_objs <- NULL
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 9.123 NA NA
1.0: import data

glb_trnent_df <- myimport_data(url=glb_trnng_url, comment="glb_trnent_df",
force_header=TRUE)
## [1] "Reading file ./data/NYTimesBlogTrain.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTrain.csv: 6,532 rows x 10 cols"
## NewsDesk SectionName SubsectionName
## 1 Business Crosswords/Games
## 2 Culture Arts
## 3 Business Business Day Dealbook
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 More School Daze
## 2 New 96-Page Murakami Work Coming in December
## 3 Public Pension Funds Stay Mum on Corporate Expats
## 4 Boot Camp for Bankers
## 5 Of Little Help to Older Knees
## 6 A Benefit of Legal Marijuana
## Snippet
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## Abstract
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## WordCount PubDate Popular UniqueID
## 1 508 2014-09-01 22:00:09 1 1
## 2 285 2014-09-01 21:14:07 0 2
## 3 1211 2014-09-01 21:05:36 0 3
## 4 1405 2014-09-01 20:43:34 1 4
## 5 181 2014-09-01 18:58:51 1 5
## 6 245 2014-09-01 18:52:22 1 6
## NewsDesk SectionName SubsectionName
## 226 Styles
## 995
## 2124 TStyle
## 3326 TStyle
## 4752 Business Technology
## 6462 Foreign
## Headline
## 226 For Tavi Gevinson, Fashion Takes a Back Seat, for Now
## 995 Reconsidering What to Call an Extremist Group
## 2124 Paris Fashion Week: Kenzo Spring/Summer 2015
## 3326 The Portable Blue Bottle
## 4752 Monster Moves to Restore a Faded Job Search Brand
## 6462 1889: Priest Questions the Meridian of Greenwich
## Snippet
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 2124 Scenes from the Paris Fashion Week photo diary of Nina Westervelt.
## 3326 The coffee purveyor has teamed up with its fellow Bay Area-based company Timbuk2 to create a travel kit.
## 4752 Monster, which revolutionized online job hunting in the 1990s, is trying to reinvent itself for the era of Twitter and Facebook with new products that capitalize on social media.
## 6462 From the International Herald Tribune archives: Priest Questions the Meridian of Greenwich in 1889.
## Abstract
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 2124 Scenes from the Paris Fashion Week photo diary of Nina Westervelt.
## 3326 The coffee purveyor has teamed up with its fellow Bay Area-based company Timbuk2 to create a travel kit.
## 4752 Monster, which revolutionized online job hunting in the 1990s, is trying to reinvent itself for the era of Twitter and Facebook with new products that capitalize on social media.
## 6462 From the International Herald Tribune archives: Priest Questions the Meridian of Greenwich in 1889.
## WordCount PubDate Popular UniqueID
## 226 459 2014-09-04 16:55:57 0 226
## 995 301 2014-09-15 16:05:13 0 995
## 2124 59 2014-09-28 11:20:02 0 2124
## 3326 248 2014-10-14 14:45:55 0 3326
## 4752 995 2014-11-02 07:00:31 0 4752
## 6462 110 2014-11-27 12:00:34 0 6462
## NewsDesk SectionName SubsectionName
## 6527 Foreign
## 6528 Opinion Room For Debate
## 6529 Foreign
## 6530 TStyle
## 6531 Multimedia
## 6532 Business
## Headline
## 6527 1914: Russians Dominate in East Poland
## 6528 Finding a Secretary of Defense
## 6529 1889: Metropolitan Opera House Reopens in New York
## 6530 The Daily Gift: Picasso Plates for Creative Dining
## 6531 Racing From New York to Barcelona
## 6532 Math Anxiety: Why Hollywood Makes Robots of Alan Turing and Other Geniuses
## Snippet
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## Abstract
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## WordCount PubDate Popular UniqueID
## 6527 176 2014-11-30 13:48:40 0 6527
## 6528 1597 2014-11-30 13:27:23 0 6528
## 6529 214 2014-11-30 09:44:57 0 6529
## 6530 61 2014-11-30 09:00:43 0 6530
## 6531 441 2014-11-30 09:00:22 0 6531
## 6532 921 2014-11-30 07:00:40 0 6532
## 'data.frame': 6532 obs. of 10 variables:
## $ NewsDesk : chr "Business" "Culture" "Business" "Business" ...
## $ SectionName : chr "Crosswords/Games" "Arts" "Business Day" "Business Day" ...
## $ SubsectionName: chr "" "" "Dealbook" "Dealbook" ...
## $ Headline : chr "More School Daze" "New 96-Page Murakami Work Coming in December" "Public Pension Funds Stay Mum on Corporate Expats" "Boot Camp for Bankers" ...
## $ Snippet : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ Abstract : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ WordCount : int 508 285 1211 1405 181 245 258 893 1077 188 ...
## $ PubDate : chr "2014-09-01 22:00:09" "2014-09-01 21:14:07" "2014-09-01 21:05:36" "2014-09-01 20:43:34" ...
## $ Popular : int 1 0 0 1 1 1 0 1 1 0 ...
## $ UniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "comment")= chr "glb_trnent_df"
## NULL
# Obtain the "new" (test) dataset: either read it from glb_newdt_url, or
# carve it out of the training data per glb_split_newdata_method.
if (glb_is_separate_newent_dataset) {
glb_newent_df <- myimport_data(url=glb_newdt_url, comment="glb_newent_df",
force_header=TRUE)
# To make plots / stats / checks easier in chunk:inspectORexplore.data
glb_entity_df <- myrbind_df(glb_trnent_df, glb_newent_df);
comment(glb_entity_df) <- "glb_entity_df"
} else {
glb_entity_df <- glb_trnent_df; comment(glb_entity_df) <- "glb_entity_df"
if (!glb_split_entity_newent_datasets) {
stop("Not implemented yet")
# unreachable until the stop() above is removed
glb_newent_df <- glb_trnent_df[sample(1:nrow(glb_trnent_df),
max(2, nrow(glb_trnent_df) / 1000)),]
} else if (glb_split_newdata_method == "condition") {
# NOTE(review): subset() via parse()d expression strings is fragile;
# a malformed glb_split_newdata_condition fails only at run time
glb_newent_df <- do.call("subset",
list(glb_trnent_df, parse(text=glb_split_newdata_condition)))
glb_trnent_df <- do.call("subset",
list(glb_trnent_df, parse(text=paste0("!(",
glb_split_newdata_condition,
")"))))
} else if (glb_split_newdata_method == "sample") {
# stratified split on the raw response to preserve class balance
require(caTools)
set.seed(glb_split_sample.seed)
split <- sample.split(glb_trnent_df[, glb_rsp_var_raw],
SplitRatio=(1-glb_split_newdata_size_ratio))
glb_newent_df <- glb_trnent_df[!split, ]
glb_trnent_df <- glb_trnent_df[split ,]
} else if (glb_split_newdata_method == "copy") {
glb_trnent_df <- glb_entity_df
comment(glb_trnent_df) <- "glb_trnent_df"
glb_newent_df <- glb_entity_df
comment(glb_newent_df) <- "glb_newent_df"
} else stop("glb_split_newdata_method should be %in% c('condition', 'sample', 'copy')")
comment(glb_newent_df) <- "glb_newent_df"
myprint_df(glb_newent_df)
str(glb_newent_df)
if (glb_split_entity_newent_datasets) {
myprint_df(glb_trnent_df)
str(glb_trnent_df)
}
}
## [1] "Reading file ./data/NYTimesBlogTest.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTest.csv: 1,870 rows x 9 cols"
## NewsDesk SectionName SubsectionName
## 1 Culture
## 2 Culture Arts
## 3 Business Crosswords/Games
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 'Birdman' Tops the Gothams
## 2 'Sleepy Hollow' Recap: A Not-So-Shocking Death
## 3 Drinking Buddy For Falstaff
## 4 Encouraging Public Service, Through Wall Street's 'Revolving Door'
## 5 Therapy Prevents Repeat Suicide Attempts
## 6 Hoping for a Good Death
## Snippet
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## Abstract
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## WordCount PubDate UniqueID
## 1 111 2014-12-01 22:45:24 6533
## 2 558 2014-12-01 22:01:34 6534
## 3 788 2014-12-01 22:00:26 6535
## 4 915 2014-12-01 21:04:13 6536
## 5 213 2014-12-01 19:13:20 6537
## 6 938 2014-12-01 19:05:12 6538
## NewsDesk SectionName SubsectionName
## 3 Business Crosswords/Games
## 725 TStyle
## 731 Business Business Day Dealbook
## 751 TStyle
## 864
## 1376 Business Business Day Small Business
## Headline
## 3 Drinking Buddy For Falstaff
## 725 Ansel Elgort Buttons Up in Brioni
## 731 Didi Dache, a Chinese Ride-Hailing App, Raises $700 Million
## 751 The Daily Gift: A Soft, Colorful Quilt From a Brooklyn Fashion Favorite
## 864 Today in Politics
## 1376 As Health Insurance Evolves, Traditional Brokers Claim They Still Have a Role
## Snippet
## 3 In which Timothy Polin reveals his potty mouth.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 731 The Singapore investor Temasek and the Chinese social network operator Tencent are among the leaders of the fund-raising round for a company that says it has 10 times the ridership of Uber.
## 751 Each day until Christmas, the editors of T share a new holiday gift idea.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## 1376 Its complex picking insurance for yourself and your family, said a health care policy director for a small-business organization. Its even more complex for a business.
## Abstract
## 3 In which Timothy Polin reveals his potty mouth.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 731 The Singapore investor Temasek and the Chinese social network operator Tencent are among the leaders of the fund-raising round for a company that says it has 10 times the ridership of Uber.
## 751 Each day until Christmas, the editors of T share a new holiday gift idea.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## 1376 Its complex picking insurance for yourself and your family, said a health care policy director for a small-business organization. Its even more complex for a business.
## WordCount PubDate UniqueID
## 3 788 2014-12-01 22:00:26 6535
## 725 89 2014-12-10 12:30:47 7257
## 731 724 2014-12-10 12:06:32 7263
## 751 85 2014-12-10 09:00:38 7283
## 864 1544 2014-12-11 07:09:25 7396
## 1376 1250 2014-12-18 07:00:05 7908
## NewsDesk SectionName SubsectionName
## 1865
## 1866 Business Technology
## 1867 Metro N.Y. / Region
## 1868 Multimedia
## 1869 Foreign World Asia Pacific
## 1870 Science Health
## Headline
## 1865 Today in Politics
## 1866 Uber Suspends Operations in Spain
## 1867 New York Today: The Year in News
## 1868 New Year, Old Memories, in Times Square
## 1869 Hong Kong Police Criticized After 14-Year-Old's Detention
## 1870 The Super-Short Workout and Other Fitness Trends
## Snippet
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## Abstract
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## WordCount PubDate UniqueID
## 1865 1616 2014-12-31 07:03:46 8397
## 1866 292 2014-12-31 06:09:32 8398
## 1867 1010 2014-12-31 06:06:58 8399
## 1868 387 2014-12-31 05:00:19 8400
## 1869 717 2014-12-31 04:16:29 8401
## 1870 818 2014-12-31 00:01:10 8402
## 'data.frame': 1870 obs. of 9 variables:
## $ NewsDesk : chr "Culture" "Culture" "Business" "Business" ...
## $ SectionName : chr "" "Arts" "Crosswords/Games" "Business Day" ...
## $ SubsectionName: chr "" "" "" "Dealbook" ...
## $ Headline : chr "'Birdman' Tops the Gothams" "'Sleepy Hollow' Recap: A Not-So-Shocking Death" "Drinking Buddy For Falstaff" "Encouraging Public Service, Through Wall Street's 'Revolving Door'" ...
## $ Snippet : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ Abstract : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ WordCount : int 111 558 788 915 213 938 1336 2644 752 99 ...
## $ PubDate : chr "2014-12-01 22:45:24" "2014-12-01 22:01:34" "2014-12-01 22:00:26" "2014-12-01 21:04:13" ...
## $ UniqueID : int 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 ...
## - attr(*, "comment")= chr "glb_newent_df"
## NULL
# Consistency checks after import
if (nrow(glb_trnent_df) == nrow(glb_entity_df))
warning("glb_trnent_df same as glb_entity_df")
if (nrow(glb_newent_df) == nrow(glb_entity_df))
warning("glb_newent_df same as glb_entity_df")
# Drop user-specified columns from all three data frames
if (length(glb_drop_vars) > 0) {
warning("dropping vars: ", paste0(glb_drop_vars, collapse=", "))
glb_entity_df <- glb_entity_df[, setdiff(names(glb_entity_df), glb_drop_vars)]
glb_trnent_df <- glb_trnent_df[, setdiff(names(glb_trnent_df), glb_drop_vars)]
glb_newent_df <- glb_newent_df[, setdiff(names(glb_newent_df), glb_drop_vars)]
}
# Check for duplicates in glb_id_vars
if (length(glb_id_vars) == 0) {
warning("using .rownames as identifiers for observations")
glb_entity_df$.rownames <- rownames(glb_entity_df)
glb_id_vars <- ".rownames"
}
if (sum(duplicated(glb_entity_df[, glb_id_vars, FALSE])) > 0)
stop(glb_id_vars, " duplicated in glb_entity_df")
# identifiers are never predictors
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_id_vars)
# Combine trnent & newent into glb_entity_df for easier manipulation
# .src tags each row's origin so the split can be reconstructed later
glb_trnent_df$.src <- "Train"; glb_newent_df$.src <- "Test";
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, ".src")
glb_entity_df <- myrbind_df(glb_trnent_df, glb_newent_df)
comment(glb_entity_df) <- "glb_entity_df"
# release the per-split frames; glb_entity_df is the single source now
glb_trnent_df <- glb_newent_df <- NULL
# Log start of the next major step
glb_chunks_df <- myadd_chunk(glb_chunks_df, "inspect.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 9.123 10.393 1.27
## 2 inspect.data 2 0 10.394 NA NA
2.0: inspect data

#print(str(glb_entity_df))
#View(glb_entity_df)
dsp_class_dstrb <- function(var) {
    # Cross-tabulate data source (.src: Train/Test) against `var`,
    # then print raw counts followed by within-source proportions.
    counts_df <- mycreate_xtab_df(glb_entity_df, c(".src", var))
    rownames(counts_df) <- counts_df$.src
    counts_df <- subset(counts_df, select=-.src)
    print(counts_df)
    row_totals <- rowSums(counts_df, na.rm=TRUE)
    print(counts_df / row_totals)
}
dsp_problem_data <- function(df) {
    # Report potentially-problematic values per column of `df`:
    #   non-character cols: counts of NA, 0, Inf, and NaN values
    #   character cols (except .src): counts of empty strings
    # The data-frame's comment() attribute, if set, labels the report.
    df_label <- if (!is.null(df_name <- comment(df))) df_name else ""
    num_cols <- setdiff(names(df), myfind_chr_cols_df(df))
    chr_cols <- setdiff(myfind_chr_cols_df(df), ".src")

    print(sprintf("numeric data missing in %s: ", df_label))
    print(sapply(num_cols, function(col) sum(is.na(df[, col]))))

    print(sprintf("numeric data w/ 0s in %s: ", df_label))
    print(sapply(num_cols, function(col) sum(df[, col] == 0, na.rm=TRUE)))

    print(sprintf("numeric data w/ Infs in %s: ", df_label))
    print(sapply(num_cols, function(col) sum(df[, col] == Inf, na.rm=TRUE)))

    # Bug fix: `x == NaN` is always NA (so the original reported 0 for
    # every column); is.nan() actually detects NaNs.
    print(sprintf("numeric data w/ NaNs in %s: ", df_label))
    print(sapply(num_cols, function(col) sum(is.nan(df[, col]))))

    print(sprintf("string data missing in %s: ", df_label))
    print(sapply(chr_cols, function(col) sum(df[, col] == "")))
}
# Performed repeatedly in other chunks
# Quick data-quality snapshot: raw-response histogram faceted by data
# source, class distribution (classification only), problem-value counts.
glb_chk_data <- function() {
# Histogram of predictor in glb_trnent_df & glb_newent_df
print(myplot_histogram(glb_entity_df, glb_rsp_var_raw) + facet_wrap(~ .src))
if (glb_is_classification)
dsp_class_dstrb(var=ifelse(glb_rsp_var %in% names(glb_entity_df),
glb_rsp_var, glb_rsp_var_raw))
dsp_problem_data(glb_entity_df)
}
glb_chk_data()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Loading required package: reshape2
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## [1] "numeric data missing in glb_entity_df: "
## WordCount Popular UniqueID
## 0 1870 0
## [1] "numeric data w/ 0s in glb_entity_df: "
## WordCount Popular UniqueID
## 109 5439 0
## [1] "numeric data w/ Infs in glb_entity_df: "
## WordCount Popular UniqueID
## 0 0 0
## [1] "numeric data w/ NaNs in glb_entity_df: "
## WordCount Popular UniqueID
## 0 0 0
## [1] "string data missing in glb_entity_df: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Create new features that help diagnostics
# Materialize the factor response (glb_rsp_var) from the raw 0/1
# column and verify the mapping.
if (!is.null(glb_map_rsp_raw_to_var)) {
glb_entity_df[, glb_rsp_var] <-
glb_map_rsp_raw_to_var(glb_entity_df[, glb_rsp_var_raw])
mycheck_map_results(mapd_df=glb_entity_df,
from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)
if (glb_is_classification) dsp_class_dstrb(glb_rsp_var)
}
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## Popular Popular.fctr .n
## 1 0 N 5439
## 2 NA <NA> 1870
## 3 1 Y 1093
## Warning: Removed 1 rows containing missing values (position_stack).
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
# Convert dates to numbers
# typically, dates come in as chars;
# so this must be done before converting chars to factors
myextract_dates_df <- function(df, vars) {
    # Extract date-part features from character timestamp columns in
    # "%Y-%m-%d %H:%M:%S" format. For each var in `vars`, creates:
    #   <var>.year, <var>.month.fctr, <var>.date.fctr (day-of-month cut
    #   into 5 ranges, roughly weekly), <var>.wkday.fctr (0=Sun..6=Sat),
    #   <var>.hour, <var>.apm.fctr ("am"/"pm"), <var>.minute, <var>.second
    # Returns a data.frame with one row per row of `df`.
    # Bug fix: the original re-created its accumulator inside the loop,
    # so with more than one var only the last var's features survived;
    # features are now accumulated across all vars.
    out_df <- NULL
    for (var in vars) {
        dts <- strptime(df[, var], "%Y-%m-%d %H:%M:%S")
        var_df <- data.frame(row.names=seq_along(dts))
        var_df[, paste0(var, ".year")] <- as.numeric(format(dts, "%Y"))
        var_df[, paste0(var, ".month.fctr")] <- as.factor(format(dts, "%m"))
        # day-of-month bucketed into 5 intervals (~ weekly)
        var_df[, paste0(var, ".date.fctr")] <-
            cut(as.numeric(format(dts, "%d")), 5)
        var_df[, paste0(var, ".wkday.fctr")] <- as.factor(format(dts, "%w"))
        var_df[, paste0(var, ".hour")] <- as.numeric(format(dts, "%H"))
        var_df[, paste0(var, ".apm.fctr")] <- as.factor(
            ifelse(var_df[, paste0(var, ".hour")] < 12, "am", "pm"))
        var_df[, paste0(var, ".minute")] <- as.numeric(format(dts, "%M"))
        var_df[, paste0(var, ".second")] <- as.numeric(format(dts, "%S"))
        out_df <- if (is.null(out_df)) var_df else cbind(out_df, var_df)
    }
    return(out_df)
}
# Extract date-derived features for each configured date variable, append
# them to the entity data frame, and exclude the raw date columns from the
# candidate feature set (their derived columns are used instead).
if (!is.null(glb_date_vars)) {
glb_entity_df <- cbind(glb_entity_df,
myextract_dates_df(glb_entity_df, glb_date_vars))
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_date_vars)
}
# check distribution of all numeric data
# Display the distribution of each listed variable against the response.
#
# vars_lst: character vector of column names in the global glb_entity_df
#
# For every name, draws a box plot (via myplot_box) of the variable vs. the
# response column glb_rsp_var; variables that are themselves factors are
# additionally faceted by their own levels before printing.
dsp_numeric_vars_dstrb <- function(vars_lst) {
    for (plot_var in vars_lst) {
        dstrb_gp <- myplot_box(df=glb_entity_df, ycol_names=plot_var,
                               xcol_name=glb_rsp_var)
        if (inherits(glb_entity_df[, plot_var], "factor"))
            dstrb_gp <- dstrb_gp + facet_wrap(reformulate(plot_var))
        print(dstrb_gp)
    }
}
# Plot distributions of every non-character, non-response column vs. the response
dsp_numeric_vars_dstrb(setdiff(names(glb_entity_df),
union(myfind_chr_cols_df(glb_entity_df),
c(glb_rsp_var_raw, glb_rsp_var))))
# Add diagnostic / transformed features to obs_df.
#
# obs_df: observations data frame (must contain a WordCount column)
# ref_df: reference data frame, reserved for syncing factor levels between
#         training & new data (currently unused by the active transforms)
#
# Active transforms:
#   WordCount.log - log(1 + WordCount); WordCount is right-skewed, and the
#                   +1 keeps zero-word observations finite
#   .rnorm        - standard-normal noise column (baseline / sanity feature)
#
# Returns obs_df with the new columns appended (in that order).
add_new_diag_feats <- function(obs_df, ref_df=glb_entity_df) {
    require(plyr)   # kept for package-load parity with the rest of the pipeline
    obs_df[, "WordCount.log"] <- log(1 + obs_df[, "WordCount"])
    # drawn once for all rows so the RNG state advances identically
    obs_df[, ".rnorm"] <- rnorm(n=nrow(obs_df))
    # Template snippets for other candidate transformations (kept for reuse):
    #   <col_name>.NA=is.na(<col_name>)
    #   <col_name>.fctr=factor(<col_name>,
    #       as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>)))
    #   <col2_name>.fctr=relevel(factor(...), ref="<ref_val>")
    #   Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M"));
    #   Year=year(Date.my); Month=months(Date.my); Weekday=weekdays(Date.my)
    #   <col_name>=<table>[as.character(<col2_name>)]
    # grep() inside mutate() does not work for <col_name>.fctr_num -
    # use sapply() instead, e.g.:
    #   obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df),
    #       function(row_ix) grep(obs_df[row_ix, "<col_name>"],
    #                             levels(obs_df[row_ix, "<col_name>.fctr"])))
    # If levels of a factor differ across obs_df & glb_newent_df,
    # predict.glm fails
    #print(summary(obs_df))
    #print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
    return(obs_df)
}
# Add WordCount.log since WordCount is not distributed normally
glb_entity_df <- add_new_diag_feats(glb_entity_df)
## Loading required package: plyr
print("Replacing WordCount with WordCount.log in potential feature set")
## [1] "Replacing WordCount with WordCount.log in potential feature set"
# The raw count is superseded by its log transform as a model feature
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, "WordCount")
# Remove PubDate.year since all entity data is from 2014
# Remove PubDate.month.fctr since all newent data is from December
# (both would be zero-variance or leak the train/test split)
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
c("PubDate.year", "PubDate.month.fctr"))
# Check distributions of newly transformed / extracted vars
# Enhancement: remove vars that were displayed earlier
dsp_numeric_vars_dstrb(setdiff(names(glb_entity_df),
union(myfind_chr_cols_df(glb_entity_df),
union(glb_rsp_var_raw,
union(glb_rsp_var, glb_exclude_vars_as_features)))))
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
#pairs(subset(glb_trnent_df, select=-c(col_symbol)))
# Check for glb_newent_df & glb_trnent_df features range mismatches
# Other diagnostics:
# print(subset(glb_trnent_df, <col1_name> == max(glb_trnent_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_trnent_df$<col1_name>, na.rm=TRUE)))
# print(glb_trnent_df[which.max(glb_trnent_df$<col_name>),])
# print(<col_name>_freq_glb_trnent_df <- mycreate_tbl_df(glb_trnent_df, "<col_name>"))
# print(which.min(table(glb_trnent_df$<col_name>)))
# print(which.max(table(glb_trnent_df$<col_name>)))
# print(which.max(table(glb_trnent_df$<col1_name>, glb_trnent_df$<col2_name>)[, 2]))
# print(table(glb_trnent_df$<col1_name>, glb_trnent_df$<col2_name>))
# print(table(is.na(glb_trnent_df$<col1_name>), glb_trnent_df$<col2_name>))
# print(table(sign(glb_trnent_df$<col1_name>), glb_trnent_df$<col2_name>))
# print(mycreate_xtab_df(glb_trnent_df, <col1_name>))
# print(mycreate_xtab_df(glb_trnent_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_trnent_df <-
# mycreate_xtab_df(glb_trnent_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_trnent_df[is.na(<col1_name>_<col2_name>_xtab_glb_trnent_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_trnent_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_trnent_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_trnent_df$<col1_name>, glb_trnent_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_trnent_df$<col1_name>.NA, glb_trnent_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_trnent_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_trnent_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_trnent_df, Symbol %in% c("KO", "PG")),
# "Date.my", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.Date("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.Date("1983-01-01")))
# )
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>") +
# geom_point(data=subset(glb_entity_df, <condition>),
# mapping=aes(x=<x_var>, y=<y_var>), color="red", shape=4, size=5))
# Advance pipeline bookkeeping to the next step: cleanse.data (minor step 2.1)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "cleanse.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 2 inspect.data 2 0 10.394 28.45 18.056
## 3 cleanse.data 2 1 28.451 NA NA
## 2.1: cleanse data
dsp_problem_data(glb_entity_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 1870 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 378 159
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 1344 141
## WordCount.log .rnorm
## 109 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Force zero WordCount.log values (log(1 + 0), i.e. zero-word blogs) to NA so
# they are treated as missing rather than as a real measurement.
# Fix: compute the zero-mask once with an is.na() guard. The original
# `glb_entity_df$WordCount.log == 0` subscript yields NA for missing entries
# (NA logical subscripts raise an error in `[<-`), and its warning counted
# rows via subset(), which silently drops NAs - the two could disagree.
zero_wcl_mask <- !is.na(glb_entity_df$WordCount.log) &
    (glb_entity_df$WordCount.log == 0)
warning("Forcing ", sum(zero_wcl_mask),
        " obs with WordCount.log 0s to NA")
## Warning: Forcing 109 obs with WordCount.log 0s to NA
glb_entity_df[zero_wcl_mask, "WordCount.log"] <- NA
dsp_problem_data(glb_entity_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 1870 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 109 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 378 159
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 1344 141
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
# Print frequency tables of the three category columns of the global
# glb_entity_df: NewsDesk, SectionName and SubsectionName, each preceded
# by a "<column>:" label.
dsp_catgs <- function() {
    for (catg_col in c("NewsDesk", "SectionName", "SubsectionName")) {
        print(paste0(catg_col, ":"))
        print(table(glb_entity_df[[catg_col]]))
    }
}
# Build a logical row-mask over glb_entity_df selecting observations that
# satisfy ALL supplied filters; NULL (default) filters are skipped.
#   Popular            - 0/1 to match labeled rows, or NA to pick the
#                        unlabeled (test) rows
#   NewsDesk, SectionName, SubsectionName - exact (==) match on the column
#   Headline.contains, Snippet.contains, Abstract.contains - regex (grep)
#                        applied to the respective text column
#   Headline.pfx, NewsDesk.nb - exact match, applied only when the derived
#                        column already exists; otherwise a warning is
#                        issued and that filter is ignored
# Returns a logical vector aligned with glb_entity_df rows (via UniqueID).
# NOTE(review): the == filters propagate NA for rows where the column is NA;
# the final UniqueID %in% step drops those rows from the mask - restyling is
# avoided to preserve this exact row-selection semantics.
sel_obs <- function(Popular=NULL,
NewsDesk=NULL, SectionName=NULL, SubsectionName=NULL,
Headline.contains=NULL, Snippet.contains=NULL, Abstract.contains=NULL,
Headline.pfx=NULL, NewsDesk.nb=NULL) {
tmp_entity_df <- glb_entity_df
# Does not work for Popular == NAs ???
if (!is.null(Popular)) {
if (is.na(Popular))
tmp_entity_df <- tmp_entity_df[is.na(tmp_entity_df$Popular), ] else
tmp_entity_df <- tmp_entity_df[tmp_entity_df$Popular == Popular, ]
}
# Exact-equality category filters
if (!is.null(NewsDesk))
tmp_entity_df <- tmp_entity_df[tmp_entity_df$NewsDesk == NewsDesk, ]
if (!is.null(SectionName))
tmp_entity_df <- tmp_entity_df[tmp_entity_df$SectionName == SectionName, ]
if (!is.null(SubsectionName))
tmp_entity_df <- tmp_entity_df[tmp_entity_df$SubsectionName == SubsectionName, ]
# Regex filters on the free-text columns (grep returns matching row indices)
if (!is.null(Headline.contains))
tmp_entity_df <-
tmp_entity_df[grep(Headline.contains, tmp_entity_df$Headline), ]
if (!is.null(Snippet.contains))
tmp_entity_df <-
tmp_entity_df[grep(Snippet.contains, tmp_entity_df$Snippet), ]
if (!is.null(Abstract.contains))
tmp_entity_df <-
tmp_entity_df[grep(Abstract.contains, tmp_entity_df$Abstract), ]
# Derived-column filters: only usable once the column has been created
if (!is.null(Headline.pfx)) {
if (length(grep("Headline.pfx", names(tmp_entity_df), fixed=TRUE, value=TRUE))
> 0) tmp_entity_df <-
tmp_entity_df[tmp_entity_df$Headline.pfx == Headline.pfx, ] else
warning("glb_entity_df does not contain Headline.pfx; ignoring that filter")
}
if (!is.null(NewsDesk.nb)) {
if (any(grepl("NewsDesk.nb", names(tmp_entity_df), fixed=TRUE)) > 0)
tmp_entity_df <-
tmp_entity_df[tmp_entity_df$NewsDesk.nb == NewsDesk.nb, ] else
warning("glb_entity_df does not contain NewsDesk.nb; ignoring that filter")
}
# Convert the surviving rows back into a mask over the full data frame
return(glb_entity_df$UniqueID %in% tmp_entity_df$UniqueID)
}
# Display selected observations of glb_entity_df.
#
# ...:  filter arguments forwarded to sel_obs()
# cols: extra columns to show besides UniqueID / Popular / Headline
# all:  TRUE prints every matching row; FALSE shows a head/tail summary
#       via myprint_df()
dsp_obs <- function(..., cols=c(NULL), all=FALSE) {
    shown_cols <- union(c("UniqueID", "Popular", "Headline"), cols)
    # third index argument FALSE == drop=FALSE: keep a data frame shape
    obs_df <- glb_entity_df[sel_obs(...), shown_cols, FALSE]
    if (all) {
        print(obs_df)
    } else {
        myprint_df(obs_df)
    }
}
#dsp_obs(Popular=1, NewsDesk="", SectionName="", Headline.contains="Boehner")
# dsp_obs(Popular=1, NewsDesk="", SectionName="")
# dsp_obs(Popular=NA, NewsDesk="", SectionName="")
# Print the 4-way contingency table (NewsDesk x SectionName x SubsectionName
# x Popular, keeping NA cells) for the observations matching the sel_obs()
# filters passed via ... .
dsp_tbl <- function(...) {
    sel_df <- glb_entity_df[sel_obs(...), ]
    catg_tbl <- table(sel_df$NewsDesk,
                      sel_df$SectionName,
                      sel_df$SubsectionName,
                      sel_df$Popular, useNA="ifany")
    #print(names(catg_tbl))
    #print(dimnames(catg_tbl))
    print(catg_tbl)
}
#dsp_tbl(NewsDesk="", SectionName="", Headline.contains="Boehner")
# Headline cross-tab: counts by Headline.pfx / Headline / response for all
# observations whose Headline matches the given regex
dsp_hdlxtab <- function(str)
print(mycreate_sqlxtab_df(glb_entity_df[sel_obs(Headline.contains=str), ],
c("Headline.pfx", "Headline", glb_rsp_var)))
# Inspect the WWI (1914) / WWII (1939) anniversary blog series
dsp_hdlxtab("(1914)|(1939)")
## "Headline.pfx"
## 1 Headline.pfx
## 2 Headline.pfx
## 3 Headline.pfx
## 4 Headline.pfx
## 5 Headline.pfx
## 6 Headline.pfx
## 7 Headline.pfx
## 8 Headline.pfx
## 9 Headline.pfx
## 10 Headline.pfx
## 11 Headline.pfx
## 12 Headline.pfx
## 13 Headline.pfx
## 14 Headline.pfx
## 15 Headline.pfx
## 16 Headline.pfx
## 17 Headline.pfx
## 18 Headline.pfx
## 19 Headline.pfx
## 20 Headline.pfx
## 21 Headline.pfx
## 22 Headline.pfx
## 23 Headline.pfx
## 24 Headline.pfx
## 25 Headline.pfx
## 26 Headline.pfx
## 27 Headline.pfx
## 28 Headline.pfx
## 29 Headline.pfx
## 30 Headline.pfx
## 31 Headline.pfx
## 32 Headline.pfx
## 33 Headline.pfx
## 34 Headline.pfx
## 35 Headline.pfx
## 36 Headline.pfx
## 37 Headline.pfx
## 38 Headline.pfx
## 39 Headline.pfx
## 40 Headline.pfx
## 41 Headline.pfx
## 42 Headline.pfx
## 43 Headline.pfx
## 44 Headline.pfx
## 45 Headline.pfx
## 46 Headline.pfx
## 47 Headline.pfx
## 48 Headline.pfx
## 49 Headline.pfx
## 50 Headline.pfx
## 51 Headline.pfx
## 52 Headline.pfx
## 53 Headline.pfx
## 54 Headline.pfx
## 55 Headline.pfx
## 56 Headline.pfx
## 57 Headline.pfx
## 58 Headline.pfx
## 59 Headline.pfx
## 60 Headline.pfx
## 61 Headline.pfx
## 62 Headline.pfx
## 63 Headline.pfx
## 64 Headline.pfx
## 65 Headline.pfx
## 66 Headline.pfx
## 67 Headline.pfx
## 68 Headline.pfx
## 69 Headline.pfx
## 70 Headline.pfx
## 71 Headline.pfx
## 72 Headline.pfx
## 73 Headline.pfx
## 74 Headline.pfx
## 75 Headline.pfx
## 76 Headline.pfx
## 77 Headline.pfx
## 78 Headline.pfx
## 79 Headline.pfx
## 80 Headline.pfx
## 81 Headline.pfx
## 82 Headline.pfx
## 83 Headline.pfx
## 84 Headline.pfx
## 85 Headline.pfx
## 86 Headline.pfx
## 87 Headline.pfx
## 88 Headline.pfx
## 89 Headline.pfx
## 90 Headline.pfx
## 91 Headline.pfx
## 92 Headline.pfx
## 93 Headline.pfx
## 94 Headline.pfx
## 95 Headline.pfx
## 96 Headline.pfx
## 97 Headline.pfx
## 98 Headline.pfx
## 99 Headline.pfx
## 100 Headline.pfx
## 101 Headline.pfx
## 102 Headline.pfx
## 103 Headline.pfx
## 104 Headline.pfx
## 105 Headline.pfx
## 106 Headline.pfx
## 107 Headline.pfx
## 108 Headline.pfx
## 109 Headline.pfx
## 110 Headline.pfx
## 111 Headline.pfx
## 112 Headline.pfx
## 113 Headline.pfx
## 114 Headline.pfx
## 115 Headline.pfx
## 116 Headline.pfx
## 117 Headline.pfx
## 118 Headline.pfx
## 119 Headline.pfx
## 120 Headline.pfx
## 121 Headline.pfx
## Headline
## 1 1914: Turkish Desire for War
## 2 1939: Letters Reaffirm City Ties
## 3 1939: Nazis' War Production Hurt
## 4 1914: 'City of Light' to Return
## 5 1914: 79 Die in Brutal German Raid
## 6 1914: Allies Advance in West Africa
## 7 1914: Antwerp Is Left in Flames
## 8 1914: Armored Train Surprise
## 9 1914: Belgians Flood Battlefield
## 10 1914: Big Guns From Pola to Defend Austrian Capital
## 11 1914: Bomb-Dropping Warplane Attacks Paris
## 12 1914: British Prime Minister Urges Irish to Enlist
## 13 1914: British Take Orange River
## 14 1914: British War Funds and Troops
## 15 1914: Christmas Shows in London
## 16 1914: Christmas at the Front
## 17 1914: Churches Of Ypres Are Targets For German Guns
## 18 1914: City Prepares for War Wounded
## 19 1914: Despite War, 'New York Herald' to Stay in Paris
## 20 1914: Fearing Espionage, Officials in London Search Travelers' Luggage
## 21 1914: France Silences Enemy Guns Near Arras
## 22 1914: France’s Champagne Prospects
## 23 1914: French Press Misjudges U.S. Ambassadors
## 24 1914: General De Wet Is Captured
## 25 1914: German Flags Hung in Invalides
## 26 1914: German Offensive Checked in Belgium
## 27 1914: Germans Attack Antwerp Forts
## 28 1914: Germans Bring Up Guns
## 29 1914: Germans Piloting Turkish Aeroplanes
## 30 1914: Germans Waste Ammunition
## 31 1914: Germany Attacks Tahiti
## 32 1914: Hospital-Ship Lost Off Coast
## 33 1914: Hotel Seizes Princess’s Art
## 34 1914: India's Millions Loyal to Core
## 35 1914: Indian Infantry Routs Turks
## 36 1914: Is Hairdresser a German Spy?
## 37 1914: Italian Mobilization Expected
## 38 1914: Italy's Foreign Minister Is Dead
## 39 1914: King George Addresses Army
## 40 1914: King Opens 'Khaki' Parliament
## 41 1914: M. Henri Pol Will Go On Caring for Paris Sparrows
## 42 1914: Naval Squadron Shells Belgium
## 43 1914: Paris Auto Trade Stalls
## 44 1914: Paris Theatres to Reopen
## 45 1914: Parisians Flock to Cemeteries on All Souls' Day to Honor Dead Soldiers
## 46 1914: Prince of Wales Passes Busy Day at the Front
## 47 1914: Prisoners Escape Paris Crowd
## 48 1914: Republicans Sweep Elections
## 49 1914: Royal Navy Stages Joint Sea and Air Strike on Cuxhaven
## 50 1914: Russian Army Scores Victory
## 51 1914: Russians Dominate in East Poland
## 52 1914: Scandinavian Alliance Formed
## 53 1914: Seizure of Oil Steamer Interests United States
## 54 1914: Sinking of the Nrnberg
## 55 1914: Sir Ernest Shackleton Outlines His Polar Projects
## 56 1914: Tipperary Song Is a Hit
## 57 1914: Turcos Drive Germans Out
## 58 1914: War May Rise Price of Hats in America
## 59 1914: Wounded May Go to Riviera
## 60 1914: Zeppelin Danger a Bluff
## 61 1939: "Stanley and Livingstone" Opens in Paris
## 62 1939: 'Nazi Spy' Named Best Film
## 63 1939: 5 Convicted in Stock Fraud
## 64 1939: 7,000,000 More Cars Predicted on U.S. Highways
## 65 1939: Advice on Heating Issued
## 66 1939: Allies Seize Contraband
## 67 1939: American Ships Hasten to Sail Before New Bill Takes Effect
## 68 1939: Australian Flyers Arrive
## 69 1939: British Unmask Reich Defenses
## 70 1939: Convict Who Fled Returns
## 71 1939: Crowds Return to Paris
## 72 1939: Eleanor Roosevelt Ready to Testify
## 73 1939: Empire Air Accord Is Signed
## 74 1939: Fighting on Western Front
## 75 1939: Film Industry Revives in Paris
## 76 1939: Finns Resist Soviet Aggression
## 77 1939: France Aims for ‘Total Peace’, Says Finance Minister
## 78 1939: France Extends Fortifications
## 79 1939: France Recalls War Premier Georges Clemenceau
## 80 1939: French Army in High Spirits
## 81 1939: French Ban Communism
## 82 1939: French Join Americans in Thanksgiving Rites
## 83 1939: French Occupy German Soil
## 84 1939: German Battleship Is Badly Damaged by British Cruisers
## 85 1939: German Troops Invade Poland
## 86 1939: Ginsberg Name Change Denied
## 87 1939: Greatest Opera Fears Eviction
## 88 1939: Hitler Appeals to God
## 89 1939: Hospital Designated Auxiliary
## 90 1939: Hungary to Defend Europe
## 91 1939: Jamming of BBC Radio Addressed
## 92 1939: Light at Night in Paris criticized
## 93 1939: Line of Demarcation Decided
## 94 1939: Louvre Hides Art in Vaults
## 95 1939: Mine Sinks Japanese Liner
## 96 1939: More Britons Called to Service
## 97 1939: Nazi Raiders Stir London
## 98 1939: Nazi Squadron Flees British
## 99 1939: Neither King Nor Soldier
## 100 1939: Poles Die Under Sovietization
## 101 1939: Polish Gold Reaches Paris
## 102 1939: Pont St. Louis Falls Into Seine
## 103 1939: Princess Louise Dies in London
## 104 1939: Radio Play Terrifies Hundreds
## 105 1939: Radio Station Charged
## 106 1939: Rain Quiets the Western Front
## 107 1939: Reich Maps Show France Partitioned, Says Daladier
## 108 1939: Ribbentrop Will Bear Guilt for Tragedy of War, Neville Chamberlain Says
## 109 1939: Roosevelt Signs Neutrality
## 110 1939: Second Meatless Day Is Named
## 111 1939: Sigmund Freud, Psychoanalyst, Dies Refugee in England at 83
## 112 1939: Sir Thomas Cullinan Dies
## 113 1939: Slovaks Are Terrorized
## 114 1939: Soviets Invade Poland
## 115 1939: Soviets Push Into China
## 116 1939: Textile Rationing Cards Issued
## 117 1939: Turks Sign Mutual Aid Pact With Allies
## 118 1939: Veterans Miss Fox Hunts
## 119 1939: War Inspires Letter-Writing
## 120 1939: War on Germany Declared
## 121 1939: Women Adopt Military Style
## Popular.fctr .n
## 1 N 1
## 2 N 1
## 3 N 1
## 4 N 1
## 5 <NA> 1
## 6 N 1
## 7 N 1
## 8 N 1
## 9 N 1
## 10 <NA> 1
## 11 N 1
## 12 N 1
## 13 N 1
## 14 N 1
## 15 <NA> 1
## 16 <NA> 1
## 17 N 1
## 18 N 1
## 19 N 1
## 20 N 1
## 21 <NA> 1
## 22 N 1
## 23 N 1
## 24 <NA> 1
## 25 N 1
## 26 N 1
## 27 N 1
## 28 N 1
## 29 N 1
## 30 N 1
## 31 N 1
## 32 N 1
## 33 <NA> 1
## 34 N 1
## 35 N 1
## 36 N 1
## 37 N 1
## 38 N 1
## 39 <NA> 1
## 40 N 1
## 41 N 1
## 42 N 1
## 43 N 1
## 44 N 1
## 45 N 1
## 46 N 1
## 47 N 1
## 48 N 1
## 49 <NA> 1
## 50 N 1
## 51 N 1
## 52 <NA> 1
## 53 N 1
## 54 <NA> 1
## 55 N 1
## 56 <NA> 1
## 57 N 1
## 58 N 1
## 59 <NA> 1
## 60 N 1
## 61 <NA> 1
## 62 <NA> 1
## 63 <NA> 1
## 64 N 1
## 65 N 1
## 66 N 1
## 67 N 1
## 68 <NA> 1
## 69 N 1
## 70 N 1
## 71 N 1
## 72 N 1
## 73 <NA> 1
## 74 N 1
## 75 N 1
## 76 N 1
## 77 N 1
## 78 <NA> 1
## 79 N 1
## 80 N 1
## 81 N 1
## 82 N 1
## 83 N 1
## 84 <NA> 1
## 85 N 1
## 86 <NA> 1
## 87 <NA> 1
## 88 <NA> 1
## 89 N 1
## 90 N 1
## 91 N 1
## 92 N 1
## 93 N 1
## 94 <NA> 1
## 95 N 1
## 96 N 1
## 97 N 1
## 98 N 1
## 99 <NA> 1
## 100 N 1
## 101 N 1
## 102 <NA> 1
## 103 <NA> 1
## 104 N 1
## 105 N 1
## 106 N 1
## 107 N 1
## 108 N 1
## 109 N 1
## 110 <NA> 1
## 111 N 1
## 112 N 1
## 113 N 1
## 114 N 1
## 115 N 1
## 116 N 1
## 117 N 1
## 118 N 1
## 119 N 1
## 120 N 1
## 121 N 1
# Category cross-tab: counts by Headline.pfx / NewsDesk / SectionName /
# SubsectionName / response for all Headlines matching the given regex
dsp_catxtab <- function(str)
print(mycreate_sqlxtab_df(glb_entity_df[sel_obs(Headline.contains=str), ],
c("Headline.pfx", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)))
# Narrow match: 1914 / 1939 anniversary series
dsp_catxtab("1914)|(1939)")
## "Headline.pfx" NewsDesk SectionName SubsectionName Popular.fctr .n
## 1 Headline.pfx Foreign N 48
## 2 Headline.pfx Foreign <NA> 13
## 3 Headline.pfx <NA> 2
# Broader: headlines prefixed 1914: / 1939: / 1964:
dsp_catxtab("19(14|39|64):")
## "Headline.pfx" NewsDesk SectionName SubsectionName Popular.fctr .n
## 1 Headline.pfx Foreign N 138
## 2 Headline.pfx Foreign <NA> 39
## 3 Headline.pfx <NA> 4
## 4 Headline.pfx N 1
# Broadest: any "19..:"-prefixed headline
dsp_catxtab("19..:")
## "Headline.pfx" NewsDesk SectionName SubsectionName Popular.fctr .n
## 1 Headline.pfx Foreign N 141
## 2 Headline.pfx Foreign <NA> 39
## 3 Headline.pfx N 9
## 4 Headline.pfx <NA> 4
# Derive a Headline.pfx category label for one row of glb_entity_df.
#
# row_ix: row index into the global glb_entity_df
#
# Matching order matters: named-series checks run first, then the prefix
# regexes on the first word, then the two-/three-word fallbacks; the first
# match wins via early return. Labels for recognized series end in "::";
# otherwise the headline's first word (or first two/three words for the
# listed sentence openers) is returned as a provisional label.
make_prefix <- function(row_ix) {
if (grepl("On This Day:", glb_entity_df[row_ix, "Headline"]))
return("On This Day::")
# NOTE(review): "(*')" looks like it was meant to be "('?)" (optional
# apostrophe) - confirm regex intent before changing
if (grepl("Reporter(*')s Notebook", glb_entity_df[row_ix, "Headline"]))
return("Reporter's Notebook::")
# 1/17 is Popular
#dsp_obs(Headline.contains="Quiz(.*)([?=|]|[?=:])", cols=c("NewsDesk.nb"))
if (grepl("Quiz(.*)([?=|]|[?=:])", glb_entity_df[row_ix, "Headline"]))
return("Quiz(.*)([?=|]|[?=:]::")
# Tokenize the headline on whitespace for the first-word checks below
words <- unlist(strsplit(glb_entity_df[row_ix, "Headline"], "\\s+"))
words <- words[words != ""]
# Although 10/10 in trnent is Popular, all of them are contained in
# NewsDesk.nb == "Styles" & does not reduce the number of blogs
# with NewsDesk.nb == "myMisc::" & Popular = 1
#dsp_obs(Headline.contains="Quandary(.*)[?=:]", all=TRUE)
# ^ forces match only at the beginning of the string; [0-9] matches any digit
# All are matched to NewsDesk=[Foreign|]; SectionName=""; SubsectionName=""
# None are Popular
if (grepl("^19[0-9][0-9]:", words[1])) return("19[0-9][0-9]::")
# Only 9 of these & none are Popular
#if (grepl("NYTLNreads", words[1])) return("NYTLNreads::")
# 6/14 are Popular
if (grepl("Readers Respond", glb_entity_df[row_ix, "Headline"]))
return("Readers Respond::")
# 4/16 are Popular
if (grepl("Your Turn:", glb_entity_df[row_ix, "Headline"])) return("Your Turn::")
# 9/18 are Popular
if (grepl("Ask Well:", glb_entity_df[row_ix, "Headline"])) return("Ask Well::")
# Consolidate all ".*Fashion Week::" since all are Popular=0; &
# NewsDesk="TStyle|Styles|.*|Culture|Metro";
# SectionName= ".*|Arts |N.Y. / Region"; SubsectionName="";
if (grepl("Fashion Week", glb_entity_df[row_ix, "Headline"]))
return(".*Fashion Week::")
# Keep "Daily Clip Report" & "Daily Report" separate"
# None are Popular
# "Daily Clip Report" -> ""::""::""
# "Daily Report" -> "Business"::"Technology"::""
# (Clip Report is tested first, otherwise "Daily Report" would swallow it)
if (grepl("Daily Clip Report", glb_entity_df[row_ix, "Headline"]))
return("Daily Clip Report::")
if (grepl("Daily Report", glb_entity_df[row_ix, "Headline"]))
return("Daily Report::")
# Keep Today in Politics & Today in Small Business separate b/c
# Today in Small Business belongs to NewsDesk.nb=Business
if (grepl("Today in Politics", glb_entity_df[row_ix, "Headline"]))
return("Today in Politics::")
if (grepl("Today in Small Business", glb_entity_df[row_ix, "Headline"]))
return("Today in Small Business::")
if (grepl("Pictures of the (Day|Year|.)", glb_entity_df[row_ix, "Headline"]))
return("Pictures of the (Day|Year|.)::")
if (words[1] %in% c("Verbatim:"))
return(paste0(words[1], ":"))
# Sentence openers that need the second (and sometimes third) word to
# disambiguate the series
if (words[1] %in% c("6", "A", "An", "At", "Daily", "First", "For", "From",
"How", "In",
"Morning", "Milan", "New", "Obama", "On",
"Paris", "Pictures", "Q.",
"Test", "The", "'The", "Today",
"What", "When", "Why", "Word")) {
# NOTE(review): paste(..., collapse=" ") on scalar args behaves like the
# default sep=" " here; sep= was probably intended
words12 <- paste(words[1], words[2], collapse=" ")
if (words12 %in% c("Morning Agenda:"))
return(paste0(words12, ":"))
if (words12 %in% c("First Draft", "Test Yourself", "What We're"))
return(paste0(words12, "::"))
if (words12 %in% c("Word of")) return(paste0(words12, " the Day::"))
if (words12 %in% c("6 Q's")) return(paste0(words12, " About the News::"))
words123 <- paste(words12, words[3], collapse=" ")
if (words12 %in% c("New York")) {
# NOTE(review): words[3] is NA for a bare two-word "New York" headline
# and this comparison would error - assumes at least 3 words; confirm
if (words[3] == "Today:") return(paste0(words123, ":"))
return(words123)
}
if (words12 %in% c("The Daily")) {
if (words[3] %in% c("Gift:")) return(paste0(words12, " Gift::"))
stop("should not happen")
}
return(words12)
}
# Fallback: provisional label = first word of the headline
return(words[1])
}
# Spot-check two known rows (a 1914-series row and a Your Turn row)
make_prefix(187)
## [1] "19[0-9][0-9]::"
make_prefix(7984)
## [1] "Your Turn::"
# make_prefix(91)
# Label every row with its headline prefix category
# NOTE(review): seq_len(nrow(...)) would be safer should the frame be empty
glb_entity_df$Headline.pfx <- sapply(1:nrow(glb_entity_df), function(row_ix) make_prefix(row_ix))
#myprint_df(glb_entity_df[, c("Headline", "Headline.pfx")])
# Cross-tab prefix category vs. response and visualize the top categories
headline_pfx_df <- mycreate_sqlxtab_df(glb_entity_df[], c("Headline.pfx", glb_rsp_var))
#print(myplot_histogram(headline_pfx_df, ".n"))
print(myplot_hbar(head(headline_pfx_df, 15), "Headline.pfx", ".n",
colorcol_name=glb_rsp_var))
#print(head(orderBy(~-.n + Headline.pfx, headline_pfx_df), 20))
print(head(headline_pfx_df, 20))
## Headline.pfx Popular.fctr .n
## 1 .*Fashion Week:: N 184
## 2 19[0-9][0-9]:: N 150
## 3 Daily Clip Report:: N 62
## 4 Morning Agenda:: N 62
## 5 6 Q's About the News:: N 61
## 6 Test Yourself:: N 61
## 7 Word of the Day:: N 61
## 8 Daily Report:: N 60
## 9 New York Today:: N 59
## 10 First Draft:: N 58
## 11 Today in Small Business:: N 58
## 12 Pictures of the (Day|Year|.):: N 53
## 13 What We're:: N 45
## 14 Today in Politics:: N 44
## 15 19[0-9][0-9]:: <NA> 43
## 16 Verbatim:: N 33
## 17 The Daily Gift:: N 26
## 18 The Daily Gift:: <NA> 24
## 19 Pictures of the (Day|Year|.):: <NA> 23
## 20 Daily Clip Report:: <NA> 22
dsp_catxtab("Today in (Politics|Small Business)")
## Headline.pfx NewsDesk SectionName SubsectionName
## 1 Today in Small Business:: Business Business Day Small Business
## 2 Today in Politics::
## 3 Today in Politics::
## 4 Today in Small Business:: Business Business Day Small Business
## 5 Today in Small Business:: Business Day Small Business
## 6 Today in Small Business:: Business
## Popular.fctr .n
## 1 N 58
## 2 N 44
## 3 <NA> 21
## 4 <NA> 13
## 5 <NA> 1
## 6 <NA> 1
dsp_obs(Headline.contains="Today in .", all=TRUE)
## UniqueID Popular
## 73 73 0
## 162 162 0
## 260 260 0
## 356 356 0
## 510 510 0
## 607 607 0
## 719 719 0
## 800 800 0
## 883 883 0
## 1024 1024 0
## 1111 1111 0
## 1184 1184 0
## 1281 1281 0
## 1379 1379 0
## 1559 1559 0
## 1654 1654 0
## 1661 1661 0
## 1801 1801 0
## 1829 1829 0
## 1913 1913 0
## 2051 2051 0
## 2194 2194 0
## 2220 2220 0
## 2286 2286 0
## 2324 2324 0
## 2397 2397 0
## 2429 2429 0
## 2501 2501 0
## 2509 2509 0
## 2537 2537 0
## 2605 2605 0
## 2638 2638 0
## 2744 2744 0
## 2773 2773 0
## 2850 2850 0
## 2878 2878 0
## 2939 2939 0
## 2973 2973 0
## 3028 3028 0
## 3071 3071 0
## 3110 3110 0
## 3166 3166 0
## 3251 3251 0
## 3282 3282 0
## 3351 3351 0
## 3374 3374 0
## 3469 3469 0
## 3548 3548 0
## 3569 3569 0
## 3648 3648 0
## 3666 3666 0
## 3791 3791 0
## 3804 3804 0
## 3874 3874 0
## 3905 3905 0
## 3970 3970 0
## 4003 4003 0
## 4057 4057 0
## 4107 4107 0
## 4151 4151 0
## 4203 4203 0
## 4293 4293 0
## 4326 4326 0
## 4392 4392 0
## 4418 4418 0
## 4487 4487 0
## 4520 4520 0
## 4567 4567 0
## 4616 4616 0
## 4678 4678 0
## 4708 4708 0
## 4805 4805 0
## 4829 4829 0
## 4886 4886 0
## 4915 4915 0
## 4974 4974 0
## 4999 4999 0
## 5068 5068 0
## 5107 5107 0
## 5162 5162 0
## 5195 5195 0
## 5288 5288 0
## 5315 5315 0
## 5366 5366 0
## 5396 5396 0
## 5449 5449 0
## 5488 5488 0
## 5547 5547 0
## 5582 5582 0
## 5635 5635 0
## 5676 5676 0
## 5768 5768 0
## 5803 5803 0
## 5862 5862 0
## 5890 5890 0
## 5954 5954 0
## 5985 5985 0
## 6045 6045 0
## 6083 6083 0
## 6155 6155 0
## 6186 6186 0
## 6296 6296 0
## 6371 6371 0
## 6431 6431 0
## 6585 6585 NA
## 6617 6617 NA
## 6668 6668 NA
## 6705 6705 NA
## 6762 6762 NA
## 6797 6797 NA
## 6849 6849 NA
## 6889 6889 NA
## 6954 6954 NA
## 6986 6986 NA
## 7076 7076 NA
## 7104 7104 NA
## 7160 7160 NA
## 7191 7191 NA
## 7261 7261 NA
## 7294 7294 NA
## 7354 7354 NA
## 7396 7396 NA
## 7453 7453 NA
## 7483 7483 NA
## 7563 7563 NA
## 7597 7597 NA
## 7667 7667 NA
## 7688 7688 NA
## 7787 7787 NA
## 7813 7813 NA
## 7880 7880 NA
## 7906 7906 NA
## 7973 7973 NA
## 8002 8002 NA
## 8090 8090 NA
## 8147 8147 NA
## 8191 8191 NA
## 8309 8309 NA
## 8353 8353 NA
## 8397 8397 NA
## Headline
## 73 Today in Small Business: Made in the U.S.A.
## 162 Today in Small Business: The Coolest New Businesses in New York
## 260 Today in Small Business: Suppose Your Company Name Is Isis
## 356 Today in Small Business: Target and Starbucks Go Small
## 510 Today in Small Business: Twitter Tests a 'Buy' Button
## 607 Today in Small Business: Best and Worst Cities for Hispanic Entrepreneurs
## 719 Today in Small Business: Internet Slowdown
## 800 Today in Small Business: For New S.B.A. Chief, the Honeymoon May Be Over
## 883 Today in Small Business: Dying Malls
## 1024 Today in Small Business: 30 Start-Ups to Watch
## 1111 Today in Small Business: The Case Against Tipping
## 1184 Today in Small Business: When You Don't Love Your Company Name
## 1281 Today in Small Business: The World's Most Mysterious Nutella Emporium
## 1379 Today in Small Business: The Bacon Bowl
## 1559 Today in Small Business: How a Store Smells
## 1654 Today in Congressional Instagram: The Majority Leader Finds Bigfoot
## 1661 Today in Small Business: Why Jewelry Stores Hide the Price Tags
## 1801 Today in Small Business: A Positive Review on Yelp Goes Viral
## 1829 Today in Politics
## 1913 Today in Small Business: Mobile Is Not a Priority
## 2051 Today in Small Business: Unlimited Vacation
## 2194 Today in Small Business: Facebook Expands Its Ad Platform
## 2220 Today in Politics
## 2286 Today in Small Business: Paper or Plastic?
## 2324 Today in Politics
## 2397 Today in Small Business: 'Bloodletting' at Tony Hsieh's Start-Up Community
## 2429 Today in Politics
## 2501 Today in Small Business: The Coolest New Businesses in Brooklyn
## 2509 Today in Political #ThrowBackThursday: Bloomberg on Ice
## 2537 Today in Politics
## 2605 Today in Small Business: Hiring Picks Up
## 2638 Today in Politics
## 2744 Today in Small Business: Is the S.B.A. Going Silicon Valley?
## 2773 Today in Politics
## 2850 Today in Small Business: A Perfect Yelp Response
## 2878 Today in Politics
## 2939 Today in Small Business: The Bacon Boom
## 2973 Today in Politics
## 3028 Today in Small Business: When Hashtags Backfire
## 3071 Today in Politics
## 3110 Today in Small Business: the Rookie Cookie
## 3166 Today in Politics
## 3251 Today in Small Business: Why Amazon Must Be Stopped
## 3282 Today in Politics
## 3351 Today in Small Business: Business Travel and Ebola
## 3374 Today in Politics
## 3469 Today in Politics
## 3548 Today in Small Business: Forget Résumés. Try Videos
## 3569 Today in Politics
## 3648 Today in Small Business: Paying Retail Employees $50,000 a Year
## 3666 Today in Politics
## 3791 Today in Small Business: How Hackers Can Stick Businesses With Huge Phone Bills
## 3804 Today in Politics
## 3874 Today in Small Business: Is Apple Pay the Future of Money?
## 3905 Today in Politics
## 3970 Today in Small Business: A Lesson in Pricing
## 4003 Today in Politics
## 4057 Today in Small Business: 'We're the Uber of Whatever!'
## 4107 Today in Politics
## 4151 Today in Small Business: Dubious Excuses for Calling in Sick
## 4203 Today in Politics
## 4293 Today in Small Business: When the Ebola Virus Touches a Business
## 4326 Today in Politics
## 4392 Today in Small Business: Start-Ups With a Social Mission
## 4418 Today in Politics
## 4487 Today in Small Business: Daring to Close on Thanksgiving
## 4520 Today in Politics
## 4567 Today in Small Business: Jimmy Kimmel Pitches 'Shark Tank'
## 4616 Today in Politics
## 4678 Today in Small Business: The Halloween Industrial Complex
## 4708 Today in Politics
## 4805 Today in Small Business: 'The Yelp of Business Software'
## 4829 Today in Politics
## 4886 Today in Small Business: Minimum Wage and Marijuana
## 4915 Today in Politics
## 4974 Today in Small Business: Election Fallout
## 4999 Today in Politics
## 5068 Today in Small Business: Veteran-Owned Businesses
## 5107 Today in Politics
## 5162 Today in Small Business: Paternity Leave
## 5195 Today in Politics
## 5288 Today in Small Business: Start-Ups Founded by Women
## 5315 Today in Politics
## 5366 Today in Small Business: An S.E.O. Challenge
## 5396 Today in Politics
## 5449 Today in Small Business: Demise of the Internet Sales Tax
## 5488 Today in Politics
## 5547 Today in Small Business: Avoiding Bad Yelp Reviews
## 5582 Today in Politics
## 5635 Today in Small Business: 'Next Generation of Lender or Boiler Room?'
## 5676 Today in Politics
## 5768 Today in Small Business: How Costco Codes Its Prices
## 5803 Today in Politics
## 5862 Today in Small Business: 'Unrealistic Value Expectations'
## 5890 Today in Politics
## 5954 Today in Small Business: Pastry, Coffee and Cats
## 5985 Today in Politics
## 6045 Today in Small Business: Why Typewriters Are Coming Back
## 6083 Today in Politics
## 6155 Today in Small Business: 'Big Cannabis' Is Coming
## 6186 Today in Politics
## 6296 Today in Politics
## 6371 Today in Politics
## 6431 Today in Politics
## 6585 Today in Small Business: 'Mean People Fail'
## 6617 Today in Politics
## 6668 Today in Small Business: Advance Ticketing for Restaurants
## 6705 Today in Politics
## 6762 Today in Small Business: Pregnancy Discrimination
## 6797 Today in Politics
## 6849 Today in Small Business: Wages
## 6889 Today in Politics
## 6954 Today in Small Business: The Best Jobs Numbers in Years?
## 6986 Today in Politics
## 7076 Today in Small Business: Problems With Apple Pay?
## 7104 Today in Politics
## 7160 Today in Small Business: The Mistletoe Drone
## 7191 Today in Politics
## 7261 Today in Small Business: 'Um, I'm Selling the Business'
## 7294 Today in Politics
## 7354 Today in Small Business: The Best Start-Ups of 2014
## 7396 Today in Politics
## 7453 Today in Small Business: The Future of Payments
## 7483 Today in Politics
## 7563 Today in Small Business: A Retail Success for Instagram
## 7597 Today in Politics
## 7667 Today in Small Business: Yelp's Gift to Business Owners
## 7688 Today in Politics
## 7787 Today in Small Business: The Year's Best Franchises
## 7813 Today in Politics
## 7880 Today in Small Business: The Best Content Marketing Blogs
## 7906 Today in Politics
## 7973 Today in Small Business: Fracking and Gambling
## 8002 Today in Politics
## 8090 Today in Politics
## 8147 Today in Politics
## 8191 Today in Politics
## 8309 Today in Politics
## 8353 Today in Politics
## 8397 Today in Politics
# Snapshot the working dataset so interactive experiments below can be rolled
# back by uncommenting the restore line.
sav_entity_df <- glb_entity_df
#glb_entity_df <- sav_entity_df
### Mine Headlines with myMisc for patterns & create a separate Headline.pfx category
# Re-bucket each row's Headline.pfx:
#   - keep any value that already ends in "::" (a previously assigned prefix)
#   - tag food- and tech-related headlines via regexes on the raw Headline
#   - everything else falls into the catch-all "myMisc::" bucket
# vapply() (vs sapply()) guarantees a character vector even for 0 rows, and
# seq_len(nrow(...)) avoids the 1:0 trap of 1:nrow(...).
glb_entity_df$Headline.pfx <- vapply(seq_len(nrow(glb_entity_df)), function(row_ix) {
    # Already categorized: prefix ends with "::".  grepl("::$", ...) replaces
    # the original strsplit/tail test, which also (incorrectly) treated an
    # empty string as already categorized because all(logical(0)) is TRUE.
    if (grepl("::$", glb_entity_df[row_ix, "Headline.pfx"]))
        return(glb_entity_df[row_ix, "Headline.pfx"])
    headline <- glb_entity_df[row_ix, "Headline"]  # hoist the repeated lookup
    # myFood:: has to come before myTech:: because of "Apple" conflict
    if (grepl("Thanksgiving", headline)) return("myFood::")
    if (grepl("Tech |Tech$|Techn[^i]", headline))
        return("myTech::")
    if (grepl("Apple|Firefox|Google|Microsoft|Robot|Yahoo", headline))
        return("myTech::")
    return("myMisc::")
}, character(1))
#print(glb_entity_df[sel_obs(Headline.pfx="myTech::"), c("UniqueID", "Headline")])
#nrow(glb_entity_df[sel_obs(Headline.pfx="myTech::"), c("UniqueID", "Headline")])
# Frequency table of the freshly assigned Headline.pfx categories
print(mycreate_sqlxtab_df(glb_entity_df, "Headline.pfx"))
## Headline.pfx .n
## 1 myMisc:: 6736
## 2 19[0-9][0-9]:: 193
## 3 .*Fashion Week:: 186
## 4 myTech:: 154
## 5 Daily Clip Report:: 84
## 6 New York Today:: 83
## 7 Daily Report:: 78
## 8 Morning Agenda:: 78
## 9 Test Yourself:: 78
## 10 Word of the Day:: 78
## 11 6 Q's About the News:: 76
## 12 Pictures of the (Day|Year|.):: 76
## 13 Today in Small Business:: 73
## 14 First Draft:: 72
## 15 Today in Politics:: 65
## 16 What We're:: 57
## 17 The Daily Gift:: 50
## 18 Verbatim:: 45
## 19 myFood:: 34
## 20 Readers Respond:: 23
## 21 Ask Well:: 18
## 22 On This Day:: 18
## 23 Quiz(.*)([?=|]|[?=:]:: 17
## 24 Your Turn:: 16
## 25 Reporter's Notebook:: 14
dsp_datagrp <- function(..., from=1, to=10, all=FALSE) {
    # Display a frequency cross-tab of (Headline.pfx, NewsDesk, SectionName,
    # SubsectionName, <response>) for the observations selected by sel_obs(...),
    # ordered by descending count (.n) and then the grouping columns.
    #
    # Args:
    #   ...  : filter arguments forwarded to sel_obs()
    #   from : first row of the ordered cross-tab to show (ignored if all=TRUE)
    #   to   : last row to show, capped at nrow()   (ignored if all=TRUE)
    #   all  : if TRUE, show every row
    # Returns: the displayed slice of the cross-tab (via print()).
    dsp_df <- orderBy(~-.n +Headline.pfx+NewsDesk+SectionName+SubsectionName,
                      mycreate_sqlxtab_df(glb_entity_df[sel_obs(...), ],
                          c("Headline.pfx", "NewsDesk", "SectionName",
                            "SubsectionName", glb_rsp_var)))
    # Plain if/else for scalar bounds (ifelse() is for vectors); computing the
    # range before printing also avoids the original's assignment-inside-print.
    row_beg <- if (all) 1 else from
    row_end <- if (all) nrow(dsp_df) else min(to, nrow(dsp_df))
    print(dsp_df[row_beg:row_end, ])
}
# Show the complete cross-tab of every data group
dsp_datagrp(all=TRUE)
## Headline.pfx NewsDesk SectionName
## 1 myMisc::
## 2 myMisc:: Business Business Day
## 3 myMisc:: Culture Arts
## 4 myMisc:: TStyle
## 5 myMisc:: OpEd Opinion
## 6 myMisc:: Business Business Day
## 7 myMisc::
## 8 myMisc:: Foreign World
## 9 myMisc:: Culture Arts
## 10 myMisc:: Business Technology
## 11 19[0-9][0-9]:: Foreign
## 12 myMisc:: OpEd Opinion
## 13 .*Fashion Week:: TStyle
## 14 myMisc:: U.S.
## 15 myMisc:: Metro N.Y. / Region
## 16 myMisc:: Travel Travel
## 17 myMisc:: Science Health
## 18 myMisc::
## 19 myMisc:: OpEd Opinion
## 20 myMisc:: Business Crosswords/Games
## 21 myMisc:: Styles U.S.
## 22 myMisc:: Multimedia
## 23 myMisc:: Business Business Day
## 24 myMisc:: TStyle
## 25 myMisc:: Business Business Day
## 26 myMisc:: Business Technology
## 27 myMisc:: Culture
## 28 myMisc:: Styles
## 29 myMisc:: Styles U.S.
## 30 myTech:: Business Technology
## 31 Daily Clip Report::
## 32 Morning Agenda:: Business Business Day
## 33 6 Q's About the News:: U.S.
## 34 Test Yourself:: U.S.
## 35 Word of the Day:: U.S.
## 36 Daily Report:: Business Technology
## 38 myMisc:: Opinion
## 39 myMisc:: Styles U.S.
## 37 New York Today:: Metro N.Y. / Region
## 40 First Draft::
## 41 Today in Small Business:: Business Business Day
## 42 myMisc:: Foreign World
## 43 Pictures of the (Day|Year|.):: Multimedia
## 44 myMisc:: Science Health
## 45 myMisc:: Science Health
## 46 myMisc:: Culture Arts
## 47 .*Fashion Week:: Styles
## 48 Today in Politics::
## 49 myMisc:: Metro N.Y. / Region
## 50 What We're::
## 51 19[0-9][0-9]:: Foreign
## 52 myMisc:: Business Crosswords/Games
## 53 myMisc:: U.S.
## 54 Verbatim::
## 55 myMisc:: Business Technology
## 56 myMisc:: Travel Travel
## 57 myMisc:: Multimedia
## 58 myMisc:: Magazine Magazine
## 59 The Daily Gift:: TStyle
## 60 myMisc:: Business Business Day
## 61 Daily Clip Report::
## 64 myMisc:: Foreign
## 62 Pictures of the (Day|Year|.):: Multimedia
## 63 The Daily Gift:: TStyle
## 65 Today in Politics::
## 67 myMisc:: Opinion
## 66 New York Today:: Metro N.Y. / Region
## 68 myMisc:: Business Crosswords/Games
## 69 Daily Report:: Business Technology
## 70 myTech:: Business Technology
## 71 myTech:: Business Technology
## 74 myMisc:: OpEd
## 72 Test Yourself:: U.S.
## 73 Word of the Day:: U.S.
## 75 Morning Agenda:: Business Business Day
## 76 myMisc:: Opinion
## 77 6 Q's About the News:: U.S.
## 79 myFood:: Science Health
## 78 On This Day::
## 80 First Draft::
## 81 myTech:: Business Business Day
## 83 myMisc:: Metro N.Y. / Region
## 84 myMisc:: Styles
## 82 Today in Small Business:: Business Business Day
## 87 myMisc:: Business Day
## 85 Quiz(.*)([?=|]|[?=:]:: U.S.
## 86 Verbatim::
## 89 myMisc:: Arts
## 88 What We're::
## 90 myMisc:: Opinion
## 91 19[0-9][0-9]::
## 92 Ask Well:: Science Health
## 94 myMisc:: Foreign World
## 93 Your Turn:: Styles U.S.
## 95 myMisc:: TStyle
## 97 myMisc:: Foreign
## 96 Reporter's Notebook::
## 98 Ask Well:: Science Health
## 99 Readers Respond::
## 100 Readers Respond::
## 101 Reporter's Notebook::
## 102 myMisc:: Opinion
## 103 myMisc:: Business Business Day
## 104 myTech:: Business Business Day
## 105 19[0-9][0-9]::
## 109 myMisc:: Crosswords/Games
## 110 myMisc:: Open
## 111 myMisc:: Opinion
## 112 myMisc:: Travel
## 113 myTech:: Business
## 106 Readers Respond::
## 107 What We're:: OpEd Opinion
## 108 Your Turn:: Styles U.S.
## 114 Ask Well:: Science Health
## 118 myFood:: U.S.
## 119 myFood:: Styles U.S.
## 120 myMisc:: Business Day
## 121 myMisc:: Foreign World
## 122 myMisc:: Magazine Magazine
## 123 myTech::
## 124 myTech:: U.S.
## 125 myTech:: Business Business Day
## 115 New York Today:: Metro N.Y. / Region
## 116 On This Day::
## 117 Quiz(.*)([?=|]|[?=:]:: U.S.
## 129 myFood:: Metro N.Y. / Region
## 130 myFood:: Styles U.S.
## 131 myMisc:: Multimedia
## 132 myMisc:: N.Y. / Region
## 133 myMisc:: Opinion
## 134 myMisc:: National
## 135 myMisc:: National U.S.
## 136 myMisc:: Science
## 137 myMisc:: Science
## 138 myMisc:: Styles Style
## 139 myTech:: Opinion
## 140 myTech:: Culture Arts
## 141 myTech:: Culture Arts
## 142 myTech:: OpEd Opinion
## 143 myTech:: Styles
## 126 Readers Respond:: Opinion
## 127 The Daily Gift::
## 128 Your Turn:: Styles U.S.
## 144 .*Fashion Week::
## 145 .*Fashion Week:: Culture Arts
## 146 .*Fashion Week:: Metro N.Y. / Region
## 147 .*Fashion Week:: Styles
## 163 myFood::
## 164 myFood:: Business Crosswords/Games
## 165 myFood:: Business Technology
## 166 myFood:: Culture Arts
## 167 myFood:: OpEd Opinion
## 168 myFood:: OpEd Opinion
## 169 myFood:: Science Health
## 171 myFood:: Travel Travel
## 170 myFood:: TStyle
## 172 myMisc:: Business Day
## 173 myMisc:: Crosswords/Games
## 174 myMisc:: Health
## 175 myMisc:: Open
## 176 myMisc:: Opinion
## 177 myMisc:: Technology
## 178 myMisc:: Travel
## 179 myMisc:: U.S.
## 180 myMisc:: U.S.
## 181 myMisc:: World
## 182 myMisc:: Culture
## 183 myMisc:: Foreign World
## 184 myMisc:: OpEd
## 185 myMisc:: Sports
## 186 myMisc:: Sports Sports
## 187 myMisc:: Styles
## 188 myMisc:: Styles Health
## 189 myMisc:: Travel Travel
## 190 myTech:: Business Day
## 191 myTech:: U.S.
## 192 myTech:: Business
## 193 myTech:: Business Business Day
## 194 myTech:: Business Business Day
## 195 myTech:: Business Crosswords/Games
## 196 myTech:: OpEd
## 197 myTech:: OpEd Opinion
## 198 myTech:: Science Health
## 199 myTech:: Styles
## 200 myTech:: TStyle
## 201 myTech:: TStyle
## 148 New York Today:: N.Y. / Region
## 149 Pictures of the (Day|Year|.):: Metro N.Y. / Region
## 150 Quiz(.*)([?=|]|[?=:]::
## 151 Quiz(.*)([?=|]|[?=:]::
## 152 Readers Respond:: Opinion
## 153 Readers Respond:: Business Business Day
## 154 Readers Respond:: Magazine Magazine
## 155 Readers Respond:: Metro N.Y. / Region
## 156 Readers Respond:: OpEd Opinion
## 157 Reporter's Notebook::
## 158 The Daily Gift::
## 159 Today in Small Business:: Business Day
## 160 Today in Small Business:: Business
## 161 What We're:: OpEd Opinion
## 162 Your Turn:: U.S.
## SubsectionName Popular.fctr .n
## 1 N 889
## 2 Dealbook N 787
## 3 N 622
## 4 N 552
## 5 Y 403
## 6 Dealbook <NA> 272
## 7 <NA> 241
## 8 Asia Pacific N 200
## 9 <NA> 160
## 10 N 155
## 11 N 141
## 12 <NA> 140
## 13 N 136
## 14 Education N 124
## 15 N 119
## 16 N 114
## 17 Y 109
## 18 Y 107
## 19 N 107
## 20 Y 101
## 21 Y 94
## 22 N 86
## 23 Dealbook Y 83
## 24 <NA> 83
## 25 Small Business N 75
## 26 <NA> 75
## 27 <NA> 70
## 28 N 68
## 29 N 65
## 30 N 64
## 31 N 62
## 32 Dealbook N 62
## 33 Education N 61
## 34 Education N 61
## 35 Education N 61
## 36 N 60
## 38 Room For Debate N 59
## 39 <NA> 59
## 37 N 59
## 40 N 58
## 41 Small Business N 58
## 42 Asia Pacific <NA> 55
## 43 N 53
## 44 <NA> 52
## 45 N 51
## 46 Y 50
## 47 N 46
## 48 N 44
## 49 <NA> 43
## 50 N 41
## 51 <NA> 39
## 52 <NA> 38
## 53 Education <NA> 36
## 54 N 33
## 55 Y 32
## 56 <NA> 31
## 57 <NA> 30
## 58 N 30
## 59 N 25
## 60 Small Business <NA> 23
## 61 <NA> 22
## 64 N 22
## 62 <NA> 22
## 63 <NA> 22
## 65 <NA> 21
## 67 Room For Debate <NA> 20
## 66 <NA> 20
## 68 N 19
## 69 <NA> 18
## 70 <NA> 18
## 71 Y 18
## 74 <NA> 17
## 72 Education <NA> 17
## 73 Education <NA> 17
## 75 Dealbook <NA> 16
## 76 The Public Editor Y 16
## 77 Education <NA> 15
## 79 N 15
## 78 N 15
## 80 <NA> 14
## 81 Dealbook N 14
## 83 Y 13
## 84 <NA> 13
## 82 Small Business <NA> 13
## 87 Dealbook <NA> 12
## 85 Education N 12
## 86 <NA> 12
## 89 <NA> 11
## 88 <NA> 11
## 90 The Public Editor <NA> 10
## 91 N 9
## 92 Y 9
## 94 N 9
## 93 N 9
## 95 Y 8
## 97 <NA> 7
## 96 <NA> 7
## 98 N 6
## 99 N 6
## 100 Y 6
## 101 N 6
## 102 <NA> 5
## 103 Small Business Y 5
## 104 Dealbook Y 5
## 105 <NA> 4
## 109 <NA> 4
## 110 N 4
## 111 The Public Editor N 4
## 112 <NA> 4
## 113 N 4
## 106 <NA> 4
## 107 N 4
## 108 Y 4
## 114 <NA> 3
## 118 Education N 3
## 119 N 3
## 120 Small Business <NA> 3
## 121 Asia Pacific Y 3
## 122 <NA> 3
## 123 N 3
## 124 Education N 3
## 125 Dealbook <NA> 3
## 115 Y 3
## 116 <NA> 3
## 117 Education <NA> 3
## 129 N 2
## 130 Y 2
## 131 Y 2
## 132 <NA> 2
## 133 N 2
## 134 N 2
## 135 Politics N 2
## 136 <NA> 2
## 137 Y 2
## 138 Fashion & Style N 2
## 139 Room For Debate N 2
## 140 <NA> 2
## 141 N 2
## 142 Y 2
## 143 N 2
## 126 N 2
## 127 <NA> 2
## 128 <NA> 2
## 144 N 1
## 145 <NA> 1
## 146 N 1
## 147 <NA> 1
## 163 N 1
## 164 Y 1
## 165 N 1
## 166 N 1
## 167 N 1
## 168 Y 1
## 169 Y 1
## 171 N 1
## 170 N 1
## 172 Small Business N 1
## 173 N 1
## 174 Y 1
## 175 <NA> 1
## 176 Room For Debate Y 1
## 177 <NA> 1
## 178 N 1
## 179 <NA> 1
## 180 N 1
## 181 Asia Pacific <NA> 1
## 182 N 1
## 183 <NA> 1
## 184 Y 1
## 185 N 1
## 186 N 1
## 187 Y 1
## 188 N 1
## 189 Y 1
## 190 Dealbook <NA> 1
## 191 Education <NA> 1
## 192 Y 1
## 193 Small Business <NA> 1
## 194 Small Business N 1
## 195 Y 1
## 196 <NA> 1
## 197 N 1
## 198 N 1
## 199 <NA> 1
## 200 N 1
## 201 Y 1
## 148 <NA> 1
## 149 <NA> 1
## 150 <NA> 1
## 151 Y 1
## 152 Y 1
## 153 Dealbook N 1
## 154 N 1
## 155 Y 1
## 156 Y 1
## 157 Y 1
## 158 N 1
## 159 Small Business <NA> 1
## 160 <NA> 1
## 161 <NA> 1
## 162 <NA> 1
# Derive a cleaned-up news-desk label ("NewsDesk.nb") per observation.
# Precedence: special Headline.pfx buckets first, then SectionName-based
# mappings, then the raw NewsDesk value if non-empty, then SubsectionName /
# Headline.pfx fallbacks; the Headline.pfx itself is the last resort.
# vapply() guarantees a character result; seq_len() is safe for 0-row frames.
glb_entity_df[, "NewsDesk.nb"] <- vapply(seq_len(nrow(glb_entity_df)), function(rix) {
    hdl_pfx <- glb_entity_df[rix, "Headline.pfx"]   # hoist repeated lookups
    section <- glb_entity_df[rix, "SectionName"]
    if (hdl_pfx %in% c("Quiz(.*)([?=|]|[?=:]::", "On This Day::"))
        return("myEducation")
    if (hdl_pfx == ".*Fashion Week::") return("TStyle")
    if (hdl_pfx == "Pictures of the (Day|Year|.)::") return("myMultimedia")
    if (section %in% c("Business Day", "Crosswords/Games", "Technology"))
        return("Business")
    if (section == "Arts") return("Culture")
    if (section == "World") return("Foreign")
    # && (scalar, short-circuit) instead of the vector & in a scalar condition
    if ((section == "N.Y. / Region") &&
        !(hdl_pfx %in% c(".*Fashion Week::", "Pictures of the (Day|Year|.)::")))
        return("Metro")
    if (section == "Multimedia") return("myMultimedia")
    if (section == "Opinion") return("OpEd")
    if (section == "Health") return("Science")
    if ((str <- glb_entity_df[rix, "NewsDesk"]) != "") return(str)
    if (glb_entity_df[rix, "SubsectionName"] == "Education") return("myEducation")
    if (hdl_pfx == "19[0-9][0-9]::") return("Foreign")
    if (hdl_pfx == "What We're::") return("OpEd")
    if (hdl_pfx == "Your Turn::") return("Styles")
    if (hdl_pfx == "myFood::") return("Styles")
    if (hdl_pfx == "myTech::") return("Business")
    return(hdl_pfx)
}, character(1))
# Compare the raw NewsDesk values against the derived NewsDesk.nb mapping
mycheck_map_results(glb_entity_df, "NewsDesk", "NewsDesk.nb", print.all=TRUE)
## NewsDesk NewsDesk.nb .n
## 1 Business Business 2026
## 2 myMisc:: 1249
## 3 Culture Culture 908
## 4 TStyle TStyle 829
## 5 OpEd OpEd 680
## 6 Foreign Foreign 477
## 7 myEducation 434
## 8 Styles Styles 325
## 9 Metro Metro 260
## 10 Science Science 251
## 11 myMultimedia 193
## 12 OpEd 174
## 13 Travel Travel 147
## 14 Daily Clip Report:: 84
## 15 First Draft:: 72
## 16 Today in Politics:: 65
## 17 Styles TStyle 47
## 18 Verbatim:: 45
## 19 Magazine Magazine 34
## 20 Business 27
## 21 Readers Respond:: 16
## 22 Foreign 14
## 23 Reporter's Notebook:: 14
## 24 Culture 11
## 25 National National 4
## 26 Metro 3
## 27 The Daily Gift:: 3
## 28 Styles 2
## 29 Sports Sports 2
## 30 Science 1
## 31 TStyle 1
## 32 Culture TStyle 1
## 33 Metro TStyle 1
## 34 Metro myMultimedia 1
## 35 Styles Science 1
# Raw NewsDesk is superseded by NewsDesk.nb; drop it from the feature set
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, "NewsDesk")
# Derive a cleaned-up section label ("SectionName.nb") per observation.
# Precedence: special Headline.pfx buckets, then the raw SectionName if
# non-empty (normalizing "Style" -> "Styles"), then NewsDesk / NewsDesk.nb
# based fallbacks; NewsDesk.nb is the last resort.
# vapply() guarantees a character result; seq_len() is safe for 0-row frames.
glb_entity_df[, "SectionName.nb"] <- vapply(seq_len(nrow(glb_entity_df)), function(rix) {
    hdl_pfx <- glb_entity_df[rix, "Headline.pfx"]   # hoist the repeated lookup
    if (hdl_pfx == "Today in Small Business::")
        return("Business Day")
    if (hdl_pfx == ".*Fashion Week::") return("TStyle")
    if (hdl_pfx %in% c("On This Day::", "Quiz(.*)([?=|]|[?=:]::")) return("U.S.")
    if (hdl_pfx == "19[0-9][0-9]::") return("World")
    str <- glb_entity_df[rix, "SectionName"]
    if (str == "Style") return("Styles")    # normalize the one-off "Style" label
    if (str != "") return (str)
    if (glb_entity_df[rix, "NewsDesk"] == "OpEd") return("Opinion")
    if (glb_entity_df[rix, "NewsDesk.nb"] == "OpEd") return("Opinion")
    if (glb_entity_df[rix, "NewsDesk"] == "Science") return("Health")
    if (glb_entity_df[rix, "NewsDesk"] == "Foreign") return("World")
    if (hdl_pfx == "myTech::") return("Technology")
    return(glb_entity_df[rix, "NewsDesk.nb"])
}, character(1))
# Compare the raw SectionName values against the derived SectionName.nb mapping
mycheck_map_results(glb_entity_df, "SectionName", "SectionName.nb", print.all=TRUE)
## SectionName SectionName.nb .n
## 1 Business Day Business Day 1437
## 2 myMisc:: 1237
## 3 TStyle 875
## 4 Arts Arts 848
## 5 Opinion Opinion 783
## 6 U.S. U.S. 657
## 7 Technology Technology 442
## 8 World World 269
## 9 N.Y. / Region N.Y. / Region 264
## 10 Health Health 249
## 11 World 222
## 12 Multimedia Multimedia 193
## 13 Crosswords/Games Crosswords/Games 165
## 14 Travel Travel 152
## 15 Daily Clip Report:: 84
## 16 Styles 83
## 17 First Draft:: 72
## 18 Culture 71
## 19 Opinion 71
## 20 Today in Politics:: 65
## 21 Verbatim:: 45
## 22 Magazine Magazine 34
## 23 U.S. 20
## 24 Readers Respond:: 16
## 25 Reporter's Notebook:: 14
## 26 Technology 13
## 27 Open Open 5
## 28 Health 4
## 29 The Daily Gift:: 3
## 30 National 2
## 31 Style Styles 2
## 32 Business Day 1
## 33 Sports 1
## 34 Arts TStyle 1
## 35 N.Y. / Region TStyle 1
## 36 Sports Sports 1
# Raw SectionName is superseded by SectionName.nb; drop it from the feature set
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, "SectionName")
# Derive a cleaned-up subsection label ("SubsectionName.nb") per observation.
# Precedence: the raw SubsectionName if non-empty, then NewsDesk.nb-based
# fallbacks, then "Small Business" for the TiSB prefix; otherwise a synthetic
# "NewsDesk.nb::SectionName.nb" key (no extra "::" when NewsDesk.nb already
# ends in "::").  vapply() + seq_len() replace the sapply(1:nrow) anti-pattern.
glb_entity_df[, "SubsectionName.nb"] <- vapply(seq_len(nrow(glb_entity_df)), function(rix) {
    if ((str <- glb_entity_df[rix, "SubsectionName"]) != "") return(str)
    if (glb_entity_df[rix, "NewsDesk.nb"] == "Styles") return("Fashion & Style")
    if (glb_entity_df[rix, "NewsDesk.nb"] == "myEducation") return("Education")
    if (glb_entity_df[rix, "Headline.pfx"] == "Today in Small Business::")
        return("Small Business")
    newsdesk_nb <- glb_entity_df[rix, "NewsDesk.nb"]
    # Plain if/else for the scalar separator (ifelse() is for vectors)
    sep <- if (grepl("::$", newsdesk_nb)) "" else "::"
    paste(newsdesk_nb, glb_entity_df[rix, "SectionName.nb"], sep=sep)
}, character(1))
# Compare raw SubsectionName against the derived SubsectionName.nb mapping
mycheck_map_results(glb_entity_df, "SubsectionName", "SubsectionName.nb")
## SubsectionName SubsectionName.nb .n
## 1 Dealbook Dealbook 1256
## 2 myMisc::myMisc:: 1237
## 3 TStyle::TStyle 877
## 4 Culture::Arts 848
## 5 OpEd::Opinion 742
## 6 Business::Technology 450
## SubsectionName SubsectionName.nb .n
## 14 Small Business Small Business 181
## 19 First Draft::First Draft:: 72
## 23 Magazine::Magazine 34
## 29 myMisc::Travel 5
## 34 myMisc::U.S. 2
## 38 myMultimedia::N.Y. / Region 1
## SubsectionName SubsectionName.nb .n
## 33 TStyle::Technology 2
## 34 myMisc::U.S. 2
## 35 Fashion & Style Fashion & Style 2
## 36 Politics Politics 2
## 37 Small Business 1
## 38 myMultimedia::N.Y. / Region 1
# Raw SubsectionName is superseded by SubsectionName.nb; drop it from features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, "SubsectionName")
# Cross-tab of Headline.pfx vs the response, ordered by prefix and response;
# the "-" and ".n" term labels combine into a "-.n" term so counts sort
# descending within each group (presumably — verify against orderBy's parsing)
hdlpfx_xtab_df <- orderBy(reformulate(c("Headline.pfx", glb_rsp_var, "-", ".n")),
mycreate_sqlxtab_df(glb_entity_df,
c("Headline.pfx", glb_rsp_var)))
myprint_df(hdlpfx_xtab_df)
## Headline.pfx Popular.fctr .n
## 4 .*Fashion Week:: N 184
## 56 .*Fashion Week:: <NA> 2
## 5 19[0-9][0-9]:: N 150
## 19 19[0-9][0-9]:: <NA> 43
## 9 6 Q's About the News:: N 61
## 34 6 Q's About the News:: <NA> 15
## Headline.pfx Popular.fctr .n
## 27 Daily Clip Report:: <NA> 22
## 14 First Draft:: N 58
## 53 New York Today:: Y 3
## 26 Pictures of the (Day|Year|.):: <NA> 23
## 10 Test Yourself:: N 61
## 32 Word of the Day:: <NA> 17
## Headline.pfx Popular.fctr .n
## 40 What We're:: <NA> 12
## 11 Word of the Day:: N 61
## 32 Word of the Day:: <NA> 17
## 44 Your Turn:: N 9
## 51 Your Turn:: Y 4
## 55 Your Turn:: <NA> 3
# Cross-tab at (NewsDesk.nb, NewsDesk, Headline.pfx, response) granularity to
# inspect how headline prefixes map onto the derived news desks
newsdesk_xtab_df <- orderBy(reformulate(
c("NewsDesk.nb", "NewsDesk",
"Headline.pfx", glb_rsp_var, "-", ".n")),
mycreate_sqlxtab_df(glb_entity_df,
c("NewsDesk.nb", "NewsDesk",
"Headline.pfx", glb_rsp_var)))
myprint_df(newsdesk_xtab_df)
## NewsDesk.nb NewsDesk Headline.pfx Popular.fctr .n
## 100 Business myMisc:: N 2
## 59 Business myMisc:: <NA> 20
## 91 Business myTech:: N 3
## 115 Business myTech:: <NA> 1
## 114 Business Today in Small Business:: <NA> 1
## 33 Business Business Daily Report:: N 60
## NewsDesk.nb NewsDesk Headline.pfx Popular.fctr .n
## 117 Business Business myFood:: N 1
## 86 National National myMisc:: N 4
## 105 OpEd Readers Respond:: N 2
## 127 OpEd OpEd myFood:: N 1
## 129 OpEd OpEd myTech:: <NA> 1
## 66 Science Science myFood:: N 15
## NewsDesk.nb NewsDesk Headline.pfx Popular.fctr .n
## 144 TStyle TStyle myTech:: N 1
## 145 TStyle TStyle myTech:: Y 1
## 53 TStyle TStyle The Daily Gift:: N 25
## 56 TStyle TStyle The Daily Gift:: <NA> 22
## 48 Verbatim:: Verbatim:: N 33
## 72 Verbatim:: Verbatim:: <NA> 12
# Cross-tab adding the section columns; print() (not myprint_df) so every row
# of the raw-vs-derived desk/section combinations is shown
ndsection_xtab_df <- orderBy(reformulate(
c("NewsDesk.nb", "NewsDesk", "SectionName.nb", "SectionName",
"Headline.pfx", glb_rsp_var, "-", ".n")),
mycreate_sqlxtab_df(glb_entity_df,
c("NewsDesk.nb", "NewsDesk", "SectionName.nb", "SectionName",
"Headline.pfx", glb_rsp_var)))
print(ndsection_xtab_df)
## NewsDesk.nb NewsDesk SectionName.nb SectionName
## 136 Business Business Day Business Day
## 75 Business Business Day Business Day
## 137 Business Business Day Business Day
## 135 Business Business Day Business Day
## 138 Business Crosswords/Games Crosswords/Games
## 99 Business Crosswords/Games Crosswords/Games
## 108 Business Technology
## 139 Business Technology Technology
## 140 Business Business Business Day
## 31 Business Business Business Day Business Day
## 74 Business Business Business Day Business Day
## 2 Business Business Business Day Business Day
## 22 Business Business Business Day Business Day
## 6 Business Business Business Day Business Day
## 76 Business Business Business Day Business Day
## 98 Business Business Business Day Business Day
## 100 Business Business Business Day Business Day
## 141 Business Business Business Day Business Day
## 39 Business Business Business Day Business Day
## 81 Business Business Business Day Business Day
## 142 Business Business Crosswords/Games Crosswords/Games
## 66 Business Business Crosswords/Games Crosswords/Games
## 20 Business Business Crosswords/Games Crosswords/Games
## 51 Business Business Crosswords/Games Crosswords/Games
## 143 Business Business Crosswords/Games Crosswords/Games
## 101 Business Business Technology
## 144 Business Business Technology
## 36 Business Business Technology Technology
## 67 Business Business Technology Technology
## 145 Business Business Technology Technology
## 10 Business Business Technology Technology
## 55 Business Business Technology Technology
## 25 Business Business Technology Technology
## 30 Business Business Technology Technology
## 69 Business Business Technology Technology
## 68 Business Business Technology Technology
## 86 Culture Arts Arts
## 146 Culture Culture Arts Arts
## 3 Culture Culture Arts Arts
## 45 Culture Culture Arts Arts
## 9 Culture Culture Arts Arts
## 119 Culture Culture Arts Arts
## 118 Culture Culture Arts Arts
## 147 Culture Culture Culture
## 26 Culture Culture Culture
## 32 Daily Clip Report:: Daily Clip Report::
## 60 Daily Clip Report:: Daily Clip Report::
## 40 First Draft:: First Draft::
## 80 First Draft:: First Draft::
## 88 Foreign World
## 102 Foreign World
## 148 Foreign World World
## 11 Foreign Foreign World
## 50 Foreign Foreign World
## 61 Foreign Foreign World
## 92 Foreign Foreign World
## 8 Foreign Foreign World World
## 109 Foreign Foreign World World
## 41 Foreign Foreign World World
## 57 Magazine Magazine Magazine Magazine
## 110 Magazine Magazine Magazine Magazine
## 149 Magazine Magazine Magazine Magazine
## 120 Metro N.Y. / Region N.Y. / Region
## 150 Metro N.Y. / Region N.Y. / Region
## 121 Metro Metro N.Y. / Region N.Y. / Region
## 15 Metro Metro N.Y. / Region N.Y. / Region
## 82 Metro Metro N.Y. / Region N.Y. / Region
## 48 Metro Metro N.Y. / Region N.Y. / Region
## 37 Metro Metro N.Y. / Region N.Y. / Region
## 111 Metro Metro N.Y. / Region N.Y. / Region
## 65 Metro Metro N.Y. / Region N.Y. / Region
## 151 Metro Metro N.Y. / Region N.Y. / Region
## 78 myEducation U.S.
## 114 myEducation U.S.
## 182 myEducation U.S.
## 181 myEducation U.S.
## 33 myEducation U.S. U.S.
## 79 myEducation U.S. U.S.
## 116 myEducation U.S. U.S.
## 14 myEducation U.S. U.S.
## 52 myEducation U.S. U.S.
## 117 myEducation U.S. U.S.
## 183 myEducation U.S. U.S.
## 85 myEducation U.S. U.S.
## 115 myEducation U.S. U.S.
## 34 myEducation U.S. U.S.
## 72 myEducation U.S. U.S.
## 35 myEducation U.S. U.S.
## 73 myEducation U.S. U.S.
## 1 myMisc:: myMisc::
## 19 myMisc:: myMisc::
## 7 myMisc:: myMisc::
## 106 myMisc:: Open Open
## 184 myMisc:: Open Open
## 185 myMisc:: Travel Travel
## 107 myMisc:: Travel Travel
## 187 myMisc:: U.S. U.S.
## 186 myMisc:: U.S. U.S.
## 23 myMultimedia Multimedia Multimedia
## 134 myMultimedia Multimedia Multimedia
## 58 myMultimedia Multimedia Multimedia
## 42 myMultimedia Multimedia Multimedia
## 63 myMultimedia Multimedia Multimedia
## 188 myMultimedia Metro N.Y. / Region N.Y. / Region
## 122 National National National
## 123 National National U.S. U.S.
## 49 OpEd Opinion
## 87 OpEd Opinion
## 28 OpEd Opinion Opinion
## 70 OpEd Opinion Opinion
## 53 OpEd Opinion Opinion
## 125 OpEd Opinion Opinion
## 124 OpEd Opinion Opinion
## 152 OpEd Opinion Opinion
## 153 OpEd OpEd Opinion
## 71 OpEd OpEd Opinion
## 154 OpEd OpEd Opinion
## 157 OpEd OpEd Opinion Opinion
## 158 OpEd OpEd Opinion Opinion
## 18 OpEd OpEd Opinion Opinion
## 5 OpEd OpEd Opinion Opinion
## 12 OpEd OpEd Opinion Opinion
## 159 OpEd OpEd Opinion Opinion
## 126 OpEd OpEd Opinion Opinion
## 155 OpEd OpEd Opinion Opinion
## 103 OpEd OpEd Opinion Opinion
## 156 OpEd OpEd Opinion Opinion
## 94 Readers Respond:: Readers Respond::
## 95 Readers Respond:: Readers Respond::
## 104 Readers Respond:: Readers Respond::
## 96 Reporter's Notebook:: Reporter's Notebook::
## 160 Reporter's Notebook:: Reporter's Notebook::
## 93 Reporter's Notebook:: Reporter's Notebook::
## 161 Science Health Health
## 128 Science Science Health
## 127 Science Science Health
## 97 Science Science Health Health
## 89 Science Science Health Health
## 112 Science Science Health Health
## 77 Science Science Health Health
## 162 Science Science Health Health
## 44 Science Science Health Health
## 17 Science Science Health Health
## 43 Science Science Health Health
## 163 Science Science Health Health
## 164 Science Styles Health Health
## 165 Sports Sports Sports
## 166 Sports Sports Sports Sports
## 167 Styles Styles
## 168 Styles U.S. U.S.
## 27 Styles Styles Styles
## 169 Styles Styles Styles
## 83 Styles Styles Styles
## 129 Styles Styles Styles Style
## 130 Styles Styles Technology
## 170 Styles Styles Technology
## 113 Styles Styles U.S. U.S.
## 132 Styles Styles U.S. U.S.
## 29 Styles Styles U.S. U.S.
## 21 Styles Styles U.S. U.S.
## 38 Styles Styles U.S. U.S.
## 90 Styles Styles U.S. U.S.
## 105 Styles Styles U.S. U.S.
## 131 Styles Styles U.S. U.S.
## 178 The Daily Gift:: The Daily Gift::
## 133 The Daily Gift:: The Daily Gift::
## 47 Today in Politics:: Today in Politics::
## 64 Today in Politics:: Today in Politics::
## 179 Travel Travel Travel Travel
## 16 Travel Travel Travel Travel
## 180 Travel Travel Travel Travel
## 56 Travel Travel Travel Travel
## 171 TStyle TStyle
## 172 TStyle Culture TStyle Arts
## 173 TStyle Metro TStyle N.Y. / Region
## 46 TStyle Styles TStyle
## 174 TStyle Styles TStyle
## 176 TStyle TStyle Technology
## 177 TStyle TStyle Technology
## 13 TStyle TStyle TStyle
## 175 TStyle TStyle TStyle
## 4 TStyle TStyle TStyle
## 91 TStyle TStyle TStyle
## 24 TStyle TStyle TStyle
## 59 TStyle TStyle TStyle
## 62 TStyle TStyle TStyle
## 54 Verbatim:: Verbatim::
## 84 Verbatim:: Verbatim::
## Headline.pfx Popular.fctr .n
## 136 myMisc:: N 1
## 75 myMisc:: <NA> 15
## 137 myTech:: <NA> 1
## 135 Today in Small Business:: <NA> 1
## 138 myMisc:: N 1
## 99 myMisc:: <NA> 4
## 108 myTech:: N 3
## 139 myMisc:: <NA> 1
## 140 Today in Small Business:: <NA> 1
## 31 Morning Agenda:: N 62
## 74 Morning Agenda:: <NA> 16
## 2 myMisc:: N 862
## 22 myMisc:: Y 88
## 6 myMisc:: <NA> 295
## 76 myTech:: N 15
## 98 myTech:: Y 5
## 100 myTech:: <NA> 4
## 141 Readers Respond:: N 1
## 39 Today in Small Business:: N 58
## 81 Today in Small Business:: <NA> 13
## 142 myFood:: Y 1
## 66 myMisc:: N 19
## 20 myMisc:: Y 101
## 51 myMisc:: <NA> 38
## 143 myTech:: Y 1
## 101 myTech:: N 4
## 144 myTech:: Y 1
## 36 Daily Report:: N 60
## 67 Daily Report:: <NA> 18
## 145 myFood:: N 1
## 10 myMisc:: N 155
## 55 myMisc:: Y 32
## 25 myMisc:: <NA> 75
## 30 myTech:: N 64
## 69 myTech:: Y 18
## 68 myTech:: <NA> 18
## 86 myMisc:: <NA> 11
## 146 myFood:: N 1
## 3 myMisc:: N 622
## 45 myMisc:: Y 50
## 9 myMisc:: <NA> 160
## 119 myTech:: N 2
## 118 myTech:: <NA> 2
## 147 myMisc:: N 1
## 26 myMisc:: <NA> 70
## 32 Daily Clip Report:: N 62
## 60 Daily Clip Report:: <NA> 22
## 40 First Draft:: N 58
## 80 First Draft:: <NA> 14
## 88 19[0-9][0-9]:: N 9
## 102 19[0-9][0-9]:: <NA> 4
## 148 myMisc:: <NA> 1
## 11 19[0-9][0-9]:: N 141
## 50 19[0-9][0-9]:: <NA> 39
## 61 myMisc:: N 22
## 92 myMisc:: <NA> 7
## 8 myMisc:: N 209
## 109 myMisc:: Y 3
## 41 myMisc:: <NA> 56
## 57 myMisc:: N 30
## 110 myMisc:: <NA> 3
## 149 Readers Respond:: N 1
## 120 myMisc:: <NA> 2
## 150 New York Today:: <NA> 1
## 121 myFood:: N 2
## 15 myMisc:: N 119
## 82 myMisc:: Y 13
## 48 myMisc:: <NA> 43
## 37 New York Today:: N 59
## 111 New York Today:: Y 3
## 65 New York Today:: <NA> 20
## 151 Readers Respond:: Y 1
## 78 On This Day:: N 15
## 114 On This Day:: <NA> 3
## 182 Quiz(.*)([?=|]|[?=:]:: Y 1
## 181 Quiz(.*)([?=|]|[?=:]:: <NA> 1
## 33 6 Q's About the News:: N 61
## 79 6 Q's About the News:: <NA> 15
## 116 myFood:: N 3
## 14 myMisc:: N 124
## 52 myMisc:: <NA> 36
## 117 myTech:: N 3
## 183 myTech:: <NA> 1
## 85 Quiz(.*)([?=|]|[?=:]:: N 12
## 115 Quiz(.*)([?=|]|[?=:]:: <NA> 3
## 34 Test Yourself:: N 61
## 72 Test Yourself:: <NA> 17
## 35 Word of the Day:: N 61
## 73 Word of the Day:: <NA> 17
## 1 myMisc:: N 889
## 19 myMisc:: Y 107
## 7 myMisc:: <NA> 241
## 106 myMisc:: N 4
## 184 myMisc:: <NA> 1
## 185 myMisc:: N 1
## 107 myMisc:: <NA> 4
## 187 myMisc:: N 1
## 186 myMisc:: <NA> 1
## 23 myMisc:: N 86
## 134 myMisc:: Y 2
## 58 myMisc:: <NA> 30
## 42 Pictures of the (Day|Year|.):: N 53
## 63 Pictures of the (Day|Year|.):: <NA> 22
## 188 Pictures of the (Day|Year|.):: <NA> 1
## 122 myMisc:: N 2
## 123 myMisc:: N 2
## 49 What We're:: N 41
## 87 What We're:: <NA> 11
## 28 myMisc:: N 65
## 70 myMisc:: Y 17
## 53 myMisc:: <NA> 35
## 125 myTech:: N 2
## 124 Readers Respond:: N 2
## 152 Readers Respond:: Y 1
## 153 myMisc:: Y 1
## 71 myMisc:: <NA> 17
## 154 myTech:: <NA> 1
## 157 myFood:: N 1
## 158 myFood:: Y 1
## 18 myMisc:: N 107
## 5 myMisc:: Y 403
## 12 myMisc:: <NA> 140
## 159 myTech:: N 1
## 126 myTech:: Y 2
## 155 Readers Respond:: Y 1
## 103 What We're:: N 4
## 156 What We're:: <NA> 1
## 94 Readers Respond:: N 6
## 95 Readers Respond:: Y 6
## 104 Readers Respond:: <NA> 4
## 96 Reporter's Notebook:: N 6
## 160 Reporter's Notebook:: Y 1
## 93 Reporter's Notebook:: <NA> 7
## 161 myMisc:: Y 1
## 128 myMisc:: Y 2
## 127 myMisc:: <NA> 2
## 97 Ask Well:: N 6
## 89 Ask Well:: Y 9
## 112 Ask Well:: <NA> 3
## 77 myFood:: N 15
## 162 myFood:: Y 1
## 44 myMisc:: N 51
## 17 myMisc:: Y 109
## 43 myMisc:: <NA> 52
## 163 myTech:: N 1
## 164 myMisc:: N 1
## 165 myMisc:: N 1
## 166 myMisc:: N 1
## 167 myFood:: N 1
## 168 Your Turn:: <NA> 1
## 27 myMisc:: N 68
## 169 myMisc:: Y 1
## 83 myMisc:: <NA> 13
## 129 myMisc:: N 2
## 130 myTech:: N 2
## 170 myTech:: <NA> 1
## 113 myFood:: N 3
## 132 myFood:: Y 2
## 29 myMisc:: N 65
## 21 myMisc:: Y 94
## 38 myMisc:: <NA> 59
## 90 Your Turn:: N 9
## 105 Your Turn:: Y 4
## 131 Your Turn:: <NA> 2
## 178 The Daily Gift:: N 1
## 133 The Daily Gift:: <NA> 2
## 47 Today in Politics:: N 44
## 64 Today in Politics:: <NA> 21
## 179 myFood:: N 1
## 16 myMisc:: N 114
## 180 myMisc:: Y 1
## 56 myMisc:: <NA> 31
## 171 .*Fashion Week:: N 1
## 172 .*Fashion Week:: <NA> 1
## 173 .*Fashion Week:: N 1
## 46 .*Fashion Week:: N 46
## 174 .*Fashion Week:: <NA> 1
## 176 myTech:: N 1
## 177 myTech:: Y 1
## 13 .*Fashion Week:: N 136
## 175 myFood:: N 1
## 4 myMisc:: N 552
## 91 myMisc:: Y 8
## 24 myMisc:: <NA> 83
## 59 The Daily Gift:: N 25
## 62 The Daily Gift:: <NA> 22
## 54 Verbatim:: N 33
## 84 Verbatim:: <NA> 12
# Full-granularity cross-tab: raw and derived desk, section, and subsection
# plus headline prefix and response
ndsecsub_xtab_df <- orderBy(reformulate(
c("NewsDesk.nb", "NewsDesk",
"SectionName.nb", "SectionName", "SubsectionName.nb", "SubsectionName",
"Headline.pfx", glb_rsp_var, "-", ".n")),
mycreate_sqlxtab_df(glb_entity_df,
c("NewsDesk.nb", "NewsDesk",
"SectionName.nb", "SectionName", "SubsectionName.nb", "SubsectionName",
"Headline.pfx", glb_rsp_var)))
myprint_df(ndsecsub_xtab_df)
## NewsDesk.nb NewsDesk SectionName.nb SectionName
## 85 Business Business Day Business Day
## 144 Business Business Day Business Day
## 146 Business Business Day Business Day
## 114 Business Business Day Business Day
## 145 Business Business Day Business Day
## 147 Business Crosswords/Games Crosswords/Games
## SubsectionName.nb SubsectionName Headline.pfx
## 85 Dealbook Dealbook myMisc::
## 144 Dealbook Dealbook myTech::
## 146 Small Business Small Business myMisc::
## 114 Small Business Small Business myMisc::
## 145 Small Business Small Business Today in Small Business::
## 147 Business::Crosswords/Games myMisc::
## Popular.fctr .n
## 85 <NA> 12
## 144 <NA> 1
## 146 N 1
## 114 <NA> 3
## 145 <NA> 1
## 147 N 1
## NewsDesk.nb NewsDesk SectionName.nb SectionName SubsectionName.nb
## 162 Metro N.Y. / Region N.Y. / Region Metro::N.Y. / Region
## 14 myEducation U.S. U.S. Education
## 89 OpEd Opinion OpEd::Opinion
## 167 OpEd OpEd Opinion OpEd::Opinion
## 95 TStyle TStyle TStyle TStyle::TStyle
## 59 TStyle TStyle TStyle TStyle::TStyle
## SubsectionName Headline.pfx Popular.fctr .n
## 162 New York Today:: <NA> 1
## 14 Education myMisc:: N 124
## 89 What We're:: <NA> 11
## 167 myTech:: <NA> 1
## 95 myMisc:: Y 8
## 59 The Daily Gift:: N 25
## NewsDesk.nb NewsDesk SectionName.nb SectionName SubsectionName.nb
## 95 TStyle TStyle TStyle TStyle::TStyle
## 24 TStyle TStyle TStyle TStyle::TStyle
## 59 TStyle TStyle TStyle TStyle::TStyle
## 63 TStyle TStyle TStyle TStyle::TStyle
## 54 Verbatim:: Verbatim:: Verbatim::Verbatim::
## 86 Verbatim:: Verbatim:: Verbatim::Verbatim::
## SubsectionName Headline.pfx Popular.fctr .n
## 95 myMisc:: Y 8
## 24 myMisc:: <NA> 83
## 59 The Daily Gift:: N 25
## 63 The Daily Gift:: <NA> 22
## 54 Verbatim:: N 33
## 86 Verbatim:: <NA> 12
print(nrow(ndsecsub_xtab_df))
## [1] 201
#print(subset(ndsecsub_xtab_df, NewsDesk == "OpEd"))
#print(subset(ndsecsub_xtab_df, SectionName.nb == "Health"))
# Persist the NewsDesk/SectionName/SubsectionName cross-tab for offline review.
# sep="," so the output actually matches its .csv extension --
# write.table() defaults to whitespace-separated fields, which produced a
# file that spreadsheet tools could not parse as CSV.
write.table(ndsecsub_xtab_df, paste0(glb_out_pfx, "ndsecsub_xtab.csv"),
            sep=",", row.names=FALSE)
#stop("here")
# dsp_datagrp(Headline.pfx="What We're::", all=TRUE)
# #dsp_obs(Headline.pfx="What We're::")
# dsp_datagrp(all=TRUE)
# ## Grouping 6 Q's About the News:: into NewsDesk=myEducation - A new category
# glb_entity_df[glb_entity_df$Headline.pfx == "6 Q's About the News::", "NewsDesk"] <-
# "myEducation"
# dsp_datagrp(7, 17)
#
# ## Sample headlines from Daily Clip Report:: ?
# dsp_obs(Headline.pfx="Daily Clip Report::")
# ## Grouping Daily Clip Report:: into NewsDesk=myCollection - A new category
# ### SectionName=myCollection - A new category
# ### SubsectionName=myCollection - A new category
# glb_entity_df[glb_entity_df$Headline.pfx == "Daily Clip Report::", "NewsDesk"] <- "myCollection"
# glb_entity_df[glb_entity_df$Headline.pfx == "Daily Clip Report::", "SectionName"] <- "myCollection"
# glb_entity_df[glb_entity_df$Headline.pfx == "Daily Clip Report::", "SubsectionName"] <- "myCollection"
#
# ## Sample headlines from Daily Report:: ?
# dsp_obs(Headline.pfx="Daily Report::")
# ### What are the SubsectionNames for <>::Business::Technology ?
# print(orderBy(~ Headline.pfx+NewsDesk+SectionName+SubsectionName,
# mycreate_sqlxtab_df(glb_entity_df[sel_obs(
# NewsDesk="Business", SectionName="Technology"), ],
# c(glb_rsp_var, "Headline.pfx", "NewsDesk", "SectionName", "SubsectionName"))))
#
# dsp_obs(Headline.pfx="Daily Report::")
# ## Grouping Daily Report:: into SubsectionName=myBus::Tech - A new category
# glb_entity_df[glb_entity_df$Headline.pfx == "Daily Report::", "SubsectionName"] <- "myBus::Tech"
# dsp_datagrp(7, 17)
#
# ## Sample headlines from First Draft:: ?
# dsp_obs(Headline.pfx="First Draft::")
# ## Grouping First Draft:: into NewsDesk=myCollection - A new category
# ### SectionName=myCollection - A new category
# ### SubsectionName=myCollection - A new category
# glb_entity_df[glb_entity_df$Headline.pfx == "First Draft::", "NewsDesk"] <- "myCollection"
# glb_entity_df[glb_entity_df$Headline.pfx == "First Draft::", "SectionName"] <- "myCollection"
# glb_entity_df[glb_entity_df$Headline.pfx == "First Draft::", "SubsectionName"] <- "myCollection"
# dsp_datagrp(1, 20)
#
# ## How are the Milan Fashion Week:: blogs categorized ?
# print(orderBy(~ Headline.pfx+NewsDesk+SectionName+SubsectionName,
# mycreate_sqlxtab_df(glb_entity_df[sel_obs(
# Headline.contains="Fashion Week"), ],
# c(glb_rsp_var, "Headline.pfx", "NewsDesk", "SectionName", "SubsectionName"))))
# print(orderBy(~ Headline.pfx+NewsDesk+SectionName+SubsectionName,
# mycreate_sqlxtab_df(glb_entity_df[sel_obs(
# NewsDesk="Styles"), ],
# c(glb_rsp_var, "Headline.pfx", "NewsDesk", "SectionName", "SubsectionName"))))
# dsp_obs(Popular=1, NewsDesk="Styles")
#
# print(orderBy(~ Headline.pfx+NewsDesk+SectionName+SubsectionName,
# mycreate_sqlxtab_df(glb_entity_df[sel_obs(
# NewsDesk="TStyle"), ],
# c(glb_rsp_var, "Headline.pfx", "NewsDesk", "SectionName", "SubsectionName"))))
# #dsp_xtab("Fashion Week")
#
# ## Sample headlines from myMisc:: ?
# dsp_obs(Popular=1, Headline.pfx="myMisc::")
# dsp_obs(Headline.contains="Saturday Morning Music") # only 1 obs
# dsp_obs(Headline.pfx="myMisc::", Headline.contains=":")
# dsp_obs(Headline.contains="Charities That Inspire Kids")
# Chi-squared test of association between membership in the observation
# subset selected via sel_obs(...) and the Popular response, computed over
# the labeled (training) observations only (Popular not NA).
#
# Prints: the 2-way contingency table (rows: selected indicator, cols:
# Popular), a legend line for the table, and the chisq.test() result.
dsp_chisq.test <- function(...) {
  labeled_df <- glb_entity_df[!is.na(glb_entity_df$Popular), ]
  hits_df <- glb_entity_df[sel_obs(...) & !is.na(glb_entity_df$Popular), ]
  hits_df$.marker <- 1
  # Left-join the selection marker onto every labeled observation;
  # non-selected rows come back as NA and are recoded to 0 below.
  joined_df <- merge(labeled_df[, c(glb_id_vars, "Popular")],
                     hits_df[, c(glb_id_vars, ".marker")], all.x=TRUE)
  joined_df[is.na(joined_df)] <- 0
  xtab <- table(joined_df$.marker, joined_df$Popular)
  print(xtab)
  print("Rows:Selected; Cols:Popular")
  print(chisq.test(xtab))
}
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test(Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# print(mycreate_sqlxtab_df(glb_entity_df[sel_obs(Headline.contains="[Ee]bola"), ],
# c(glb_rsp_var, "NewsDesk", "SectionName", "SubsectionName")))
#dsp_NewsDesk_SectionName_obs("", "U.S.")
# print(table(glb_entity_df$NewsDesk, glb_entity_df$SectionName))
# print(table(glb_entity_df$SectionName, glb_entity_df$SubsectionName))
# print(table(glb_entity_df$NewsDesk, glb_entity_df$SectionName, glb_entity_df$SubsectionName))
# Copy Headline into Snippet & Abstract if they are empty
print(glb_entity_df[nchar(glb_entity_df[, "Snippet"]) == 0, c("Headline", "Snippet")])
## Headline
## 2838 First Draft Focus: Off to Raise Money for Democrats
## 3728 Verbatim: Obama as Supreme Court Justice?
## 4904 Election 2014: Live Coverage
## 4994 Election 2014: Live Coverage
## 5029 First Draft Focus: Perry's Day in Court
## 5065 First Draft Focus: Honoring a Civil War Hero
## 5160 Supreme Court to Hear New Health Law Challenge
## 5254 Verbatim: Will Rick Perry Run?
## 5472 First Draft Focus: A Red Carpet Welcome
## 7129 First Draft Focus: Pass a Bill
## 7164 Does Torture Work? C.I.A.'s Claims vs. Senate Panel's Findings
## 7364 First Draft Focus: Three Wise Men
## 7368 Verbatim: The People's Priorities
## Snippet
## 2838
## 3728
## 4904
## 4994
## 5029
## 5065
## 5160
## 5254
## 5472
## 7129
## 7164
## 7364
## 7368
print(glb_entity_df[glb_entity_df$Headline == glb_entity_df$Snippet,
c("UniqueID", "Headline", "Snippet")])
## [1] UniqueID Headline Snippet
## <0 rows> (or 0-length row.names)
# Backfill empty Snippet values with the Headline so downstream text
# mining never sees a zero-length document.
glb_entity_df[nchar(glb_entity_df[, "Snippet"]) == 0, "Snippet"] <-
glb_entity_df[nchar(glb_entity_df[, "Snippet"]) == 0, "Headline"]
print(glb_entity_df[nchar(glb_entity_df[, "Abstract"]) == 0, c("Headline", "Abstract")])
## Headline
## 2838 First Draft Focus: Off to Raise Money for Democrats
## 3728 Verbatim: Obama as Supreme Court Justice?
## 4904 Election 2014: Live Coverage
## 4994 Election 2014: Live Coverage
## 5029 First Draft Focus: Perry's Day in Court
## 5065 First Draft Focus: Honoring a Civil War Hero
## 5160 Supreme Court to Hear New Health Law Challenge
## 5254 Verbatim: Will Rick Perry Run?
## 5472 First Draft Focus: A Red Carpet Welcome
## 7129 First Draft Focus: Pass a Bill
## 7164 Does Torture Work? C.I.A.'s Claims vs. Senate Panel's Findings
## 7309 Spending Bill Passes House With Democratic Support
## 7310 Funding Bill Hangs in Balance as House Votes
## 7315 House Democrats Vent Frustration With White House
## 7329 Obama Works the Phones to Get Funding Deal Done
## 7364 First Draft Focus: Three Wise Men
## 7368 Verbatim: The People's Priorities
## Abstract
## 2838
## 3728
## 4904
## 4994
## 5029
## 5065
## 5160
## 5254
## 5472
## 7129
## 7164
## 7309
## 7310
## 7315
## 7329
## 7364
## 7368
print(glb_entity_df[glb_entity_df$Headline == glb_entity_df$Abstract,
c("UniqueID", "Headline", "Abstract")])
## [1] UniqueID Headline Abstract
## <0 rows> (or 0-length row.names)
# Backfill empty Abstract values with the Headline, mirroring the Snippet
# backfill above, so every document has non-empty text.
glb_entity_df[nchar(glb_entity_df[, "Abstract"]) == 0, "Abstract"] <-
glb_entity_df[nchar(glb_entity_df[, "Abstract"]) == 0, "Headline"]
# WordCount_0_df <- subset(glb_entity_df, WordCount == 0)
# table(WordCount_0_df$Popular, WordCount_0_df$WordCount, useNA="ifany")
# myprint_df(WordCount_0_df[,
# c("UniqueID", "Popular", "WordCount", "Headline")])
glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 3 cleanse.data 2 1 28.451 56.654 28.203
## 4 manage.missing.data 2 2 56.654 NA NA
2.2: manage missing data

# print(sapply(names(glb_trnent_df), function(col) sum(is.na(glb_trnent_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
# glb_trnent_df <- na.omit(glb_trnent_df)
# glb_newent_df <- na.omit(glb_newent_df)
# df[is.na(df)] <- 0
dsp_problem_data(glb_entity_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 1870 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 109 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 378 159
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 1344 141
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline
## 2408 2899 6176 0
## Snippet Abstract PubDate Headline.pfx
## 0 0 0 0
## NewsDesk.nb SectionName.nb SubsectionName.nb
## 0 0 0
# Impute missing numeric data (currently only WordCount.log has NAs) via
# multiple imputation with the mice package.
#
# Reads globals: glb_entity_df, glb_exclude_vars_as_features, glb_rsp_var,
# glb_mice_complete.seed.
# Returns: the imputed WordCount.log vector, row-aligned with glb_entity_df.
#
# Not refactored into mydsutils.R since glb_*_df might be reassigned
glb_impute_missing_data <- function() {
    # require() returns FALSE instead of stopping when the package is
    # missing; fail fast with a clear message rather than erroring later
    # inside mice()/complete().
    if (!require(mice))
        stop("glb_impute_missing_data: package 'mice' is required but not installed")
    # mice() is stochastic; fix the seed so the imputation is reproducible.
    set.seed(glb_mice_complete.seed)
    # Drop the response and excluded features before imputation so they
    # cannot leak into the imputation model.
    inp_impent_df <- glb_entity_df[, setdiff(names(glb_entity_df),
                        union(glb_exclude_vars_as_features, glb_rsp_var))]
    print("Summary before imputation: ")
    print(summary(inp_impent_df))
    # complete() extracts the first imputed dataset from the mice fit.
    out_impent_df <- complete(mice(inp_impent_df))
    print(summary(out_impent_df))
    return(out_impent_df[, "WordCount.log"])
}
# Overwrite WordCount.log with the imputed column only when imputation is
# enabled via the glb_impute_na_data control flag.
if (glb_impute_na_data)
glb_entity_df[, "WordCount.log"] <- glb_impute_missing_data()
## Loading required package: mice
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
## [1] "Summary before imputation: "
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour PubDate.apm.fctr
## (0.97,7]:1981 0: 378 Min. : 0.00 am:3636
## (7,13] :1757 1:1605 1st Qu.: 9.00 pm:4766
## (13,19] :1808 2:1559 Median :12.00
## (19,25] :1650 3:1614 Mean :12.22
## (25,31] :1206 4:1539 3rd Qu.:16.00
## 5:1470 Max. :23.00
## 6: 237
## PubDate.minute PubDate.second WordCount.log .rnorm
## Min. : 0.00 Min. : 0.00 Min. :0.6932 Min. :-3.881663
## 1st Qu.: 5.00 1st Qu.:14.00 1st Qu.:5.2679 1st Qu.:-0.665043
## Median :24.00 Median :30.00 Median :5.9480 Median :-0.004510
## Mean :24.11 Mean :29.49 Mean :5.8263 Mean :-0.006807
## 3rd Qu.:40.00 3rd Qu.:44.00 3rd Qu.:6.6067 3rd Qu.: 0.664125
## Max. :59.00 Max. :59.00 Max. :9.2977 Max. : 3.356092
## NA's :109
## Headline.pfx NewsDesk.nb SectionName.nb
## Length:8402 Length:8402 Length:8402
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## SubsectionName.nb
## Length:8402
## Class :character
## Mode :character
##
##
##
##
##
## iter imp variable
## 1 1 WordCount.log
## 1 2 WordCount.log
## 1 3 WordCount.log
## 1 4 WordCount.log
## 1 5 WordCount.log
## 2 1 WordCount.log
## 2 2 WordCount.log
## 2 3 WordCount.log
## 2 4 WordCount.log
## 2 5 WordCount.log
## 3 1 WordCount.log
## 3 2 WordCount.log
## 3 3 WordCount.log
## 3 4 WordCount.log
## 3 5 WordCount.log
## 4 1 WordCount.log
## 4 2 WordCount.log
## 4 3 WordCount.log
## 4 4 WordCount.log
## 4 5 WordCount.log
## 5 1 WordCount.log
## 5 2 WordCount.log
## 5 3 WordCount.log
## 5 4 WordCount.log
## 5 5 WordCount.log
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour PubDate.apm.fctr
## (0.97,7]:1981 0: 378 Min. : 0.00 am:3636
## (7,13] :1757 1:1605 1st Qu.: 9.00 pm:4766
## (13,19] :1808 2:1559 Median :12.00
## (19,25] :1650 3:1614 Mean :12.22
## (25,31] :1206 4:1539 3rd Qu.:16.00
## 5:1470 Max. :23.00
## 6: 237
## PubDate.minute PubDate.second WordCount.log .rnorm
## Min. : 0.00 Min. : 0.00 Min. :0.6931 Min. :-3.881663
## 1st Qu.: 5.00 1st Qu.:14.00 1st Qu.:5.2730 1st Qu.:-0.665043
## Median :24.00 Median :30.00 Median :5.9467 Median :-0.004510
## Mean :24.11 Mean :29.49 Mean :5.8263 Mean :-0.006807
## 3rd Qu.:40.00 3rd Qu.:44.00 3rd Qu.:6.6067 3rd Qu.: 0.664125
## Max. :59.00 Max. :59.00 Max. :9.2977 Max. : 3.356092
##
## Headline.pfx NewsDesk.nb SectionName.nb
## Length:8402 Length:8402 Length:8402
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## SubsectionName.nb
## Length:8402
## Class :character
## Mode :character
##
##
##
##
dsp_problem_data(glb_entity_df)
## [1] "numeric data missing in : "
## WordCount Popular UniqueID
## 0 1870 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 1870 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ 0s in : "
## WordCount Popular UniqueID
## 109 5439 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 378 159
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 1344 141
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ Infs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "numeric data w/ NaNs in : "
## WordCount Popular UniqueID
## 0 0 0
## Popular.fctr PubDate.year PubDate.month.fctr
## 0 0 0
## PubDate.date.fctr PubDate.wkday.fctr PubDate.hour
## 0 0 0
## PubDate.apm.fctr PubDate.minute PubDate.second
## 0 0 0
## WordCount.log .rnorm
## 0 0
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline
## 2408 2899 6176 0
## Snippet Abstract PubDate Headline.pfx
## 0 0 0 0
## NewsDesk.nb SectionName.nb SubsectionName.nb
## 0 0 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "encode.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 4 manage.missing.data 2 2 56.654 62.058 5.404
## 5 encode.data 2 3 62.059 NA NA
2.3: encode data

# map_<col_name>_df <- myimport_data(
# url="<map_url>",
# comment="map_<col_name>_df", print_diagn=TRUE)
# map_<col_name>_df <- read.csv(paste0(getwd(), "/data/<file_name>.csv"), strip.white=TRUE)
# glb_trnent_df <- mymap_codes(glb_trnent_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_newent_df <- mymap_codes(glb_newent_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_trnent_df$<col_name>.fctr <- factor(glb_trnent_df$<col_name>,
# as.factor(union(glb_trnent_df$<col_name>, glb_newent_df$<col_name>)))
# glb_newent_df$<col_name>.fctr <- factor(glb_newent_df$<col_name>,
# as.factor(union(glb_trnent_df$<col_name>, glb_newent_df$<col_name>)))
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 5 encode.data 2 3 62.059 62.115 0.057
## 6 extract.features 3 0 62.116 NA NA
3.0: extract features

#```{r extract_features, cache=FALSE, eval=glb_is_textual}
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_trnent_df$<col_name>), -2, na.pad=TRUE)
# glb_trnent_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newent_df$<col_name>), -2, na.pad=TRUE)
# glb_newent_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newent_df[1, "<col_name>.lag.2"] <- glb_trnent_df[nrow(glb_trnent_df) - 1,
# "<col_name>"]
# glb_newent_df[2, "<col_name>.lag.2"] <- glb_trnent_df[nrow(glb_trnent_df),
# "<col_name>"]
# glb_entity_df <- mutate(glb_entity_df,
# A.has.http=ifelse(grepl("http",Added,fixed=TRUE), 1, 0)
# )
#
# glb_trnent_df <- mutate(glb_trnent_df,
# )
#
# glb_newent_df <- mutate(glb_newent_df,
# )
# Create factors of string variables
# Scan every column of glb_entity_df and collect the names of the
# character-typed ones (non-character columns map to "" and are dropped).
str_vars <- sapply(names(glb_entity_df), function(var)
ifelse(class(glb_entity_df[, var]) == "character", var, ""))
print(str_vars <- str_vars[str_vars != ""])
## NewsDesk SectionName SubsectionName
## "NewsDesk" "SectionName" "SubsectionName"
## Headline Snippet Abstract
## "Headline" "Snippet" "Abstract"
## PubDate .src Headline.pfx
## "PubDate" ".src" "Headline.pfx"
## NewsDesk.nb SectionName.nb SubsectionName.nb
## "NewsDesk.nb" "SectionName.nb" "SubsectionName.nb"
# For each string column not already excluded as a feature: exclude the
# raw string column itself, then add a parallel <var>.fctr factor column
# whose levels are the unique observed values.
if (length(str_vars <- setdiff(str_vars,
glb_exclude_vars_as_features)) > 0) {
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
for (var in str_vars) {
# warning() rather than print() so the cardinality report also surfaces
# in the knitr output as a visible diagnostic.
warning("Creating factors of string variable: ", var,
": # of unique values: ", length(unique(glb_entity_df[, var])))
glb_entity_df[, paste0(var, ".fctr")] <- factor(glb_entity_df[, var],
as.factor(unique(glb_entity_df[, var])))
# glb_trnent_df[, paste0(var, ".fctr")] <- factor(glb_trnent_df[, var],
# as.factor(unique(glb_entity_df[, var])))
# glb_newent_df[, paste0(var, ".fctr")] <- factor(glb_newent_df[, var],
# as.factor(unique(glb_entity_df[, var])))
}
}
## Warning: Creating factors of string variable: Headline.pfx: # of unique
## values: 25
## Warning: Creating factors of string variable: NewsDesk.nb: # of unique
## values: 22
## Warning: Creating factors of string variable: SectionName.nb: # of unique
## values: 26
## Warning: Creating factors of string variable: SubsectionName.nb: # of
## unique values: 35
# Text mining: for each text variable (Headline / Snippet / Abstract) build
# a tm corpus, derive full and sparse document-term matrices, plot term
# frequencies, then bind term-count features plus derived text metrics
# (.has.http, .has.ebola, .is.question, .num.chars, .num.words[.unq] and
# their log transforms) onto glb_entity_df.
if (glb_is_textual) {
require(tm)
# Per-text-variable stashes for the corpus and the full/sparse DTMs,
# reused later when binding DTM columns onto glb_entity_df.
glb_corpus_lst <- list(); glb_full_DTM_lst <- list(); glb_sprs_DTM_lst <- list();
for (txt_var in glb_txt_vars) {
print(sprintf("Building corpus for %s...", txt_var))
# Combine "new york" to "newyork"
# shd be created as a tm_map::content_transformer
# NOTE(review): txt_df is a character vector here despite the _df suffix.
txt_df <- glb_entity_df[, txt_var]
# Fuse common "New <X>" bigrams into single tokens so tokenization does
# not split them into a generic "new" plus a place name.
txt_df <- gsub("[Nn]ew [Dd]elhi", "newdelhi", txt_df)
txt_df <- gsub("[Nn]ew [Gg]uinea", "newguinea", txt_df)
txt_df <- gsub("[Nn]ew [Jj]ersey", "newjersey", txt_df)
txt_df <- gsub("[Nn]ew [Oo]rleans", "neworleans", txt_df)
txt_df <- gsub("[Nn]ew [Yy]ear", "newyear", txt_df)
txt_df <- gsub("[Nn]ew [Yy]ork", "newyork", txt_df)
txt_df <- gsub("[Nn]ew [Zz]ealand", "newzealand", txt_df)
# Per-variable custom stop words (all currently empty); the commented
# dsp_chisq.test calls record the exploration that informed each list.
if (txt_var == "Headline") {
# dsp_chisq.test(Headline.contains="[Nn]ew ")
# print(head(txt_df[grep("[Nn]ew ", txt_df)]))
# print(tail(txt_df[grep("[Nn]ew ", txt_df)]))
# print(sample(txt_df[grep("[Nn]ew ", txt_df)], 5))
# print(length(txt_df[grep("[Nn]ew ", txt_df)]))
# print(txt_df[grep("[Nn]ew ", txt_df)][01:20])
# print(txt_df[grep("[Nn]ew ", txt_df)][21:40])
# print(txt_df[grep("[Nn]ew ", txt_df)][41:60])
# print(txt_df[grep("[Nn]ew ", txt_df)][61:80])
# print(txt_df[grep("[Nn]ew ", txt_df)][81:100])
# #print(length(txt_df[grep("[Nn]ew [Zz]ealand", txt_df)]))
# dsp_chisq.test(Headline.contains="[Nn]ew [Yy]ork")
# dsp_chisq.test(Headline.contains="[Re]eport")
# dsp_chisq.test(Snippet.contains="[Re]eport")
#
# dsp_chisq.test(Headline.contains="[Ww]eek")
# dsp_chisq.test(Headline.contains="[Dd]ay")
# dsp_chisq.test(Headline.contains="[Ff]ashion")
# dsp_chisq.test(Headline.contains="[Tt]oday")
# dsp_chisq.test(Headline.contains="[Dd]ail")
# dsp_chisq.test(Headline.contains="2014")
# dsp_chisq.test(Headline.contains="2015")
glb_append_stop_words[["Headline"]] <- c(NULL)
}
if (txt_var == "Snippet") {
# dsp_chisq.test(Snippet.contains="[Nn]ew ")
# print(head(txt_df[grep("[Nn]ew ", txt_df)]))
# print(tail(txt_df[grep("[Nn]ew ", txt_df)]))
# print(sample(txt_df[grep("[Nn]ew ", txt_df)], 5))
# print(length(txt_df[grep("[Nn]ew ", txt_df)]))
# print(txt_df[grep("[Nn]ew ", txt_df)][11:20])
# print(txt_df[grep("[Nn]ew ", txt_df)][21:30])
# print(txt_df[grep("[Nn]ew ", txt_df)][31:40])
# print(txt_df[grep("[Nn]ew ", txt_df)][41:50])
# print(txt_df[grep("[Nn]ew ", txt_df)][51:60])
# #print(length(txt_df[grep("[Nn]ew [Zz]ealand", txt_df)]))
# dsp_chisq.test(Snippet.contains="[Ww]ill")
# dsp_chisq.test(Snippet.contains="[Tt]ime")
# dsp_chisq.test(Snippet.contains="[Ww]eek")
# dsp_chisq.test(Snippet.contains="[Yy]ear")
# dsp_chisq.test(Snippet.contains="[Ne]w [Yy]ork")
# dsp_chisq.test(Snippet.contains="[Cc]ompan")
# dsp_chisq.test(Snippet.contains="[Oo]ne")
# dsp_chisq.test(Snippet.contains="[Rr]eport")
# dsp_chisq.test(Snippet.contains="[Pp]resid")
# dsp_chisq.test(Snippet.contains="[Ss]aid")
# dsp_chisq.test(Snippet.contains="[Cc]an")
# dsp_chisq.test(Snippet.contains="[Dd]ay")
glb_append_stop_words[["Snippet"]] <- c(NULL)
#c("can")
}
if (txt_var == "Abstract") {
# dsp_chisq.test(Abstract.contains="[Nn]ew ")
# print(head(txt_df[grep("[Nn]ew ", txt_df)]))
# print(tail(txt_df[grep("[Nn]ew ", txt_df)]))
# print(sample(txt_df[grep("[Nn]ew ", txt_df)], 5))
# print(length(txt_df[grep("[Nn]ew ", txt_df)]))
# print(txt_df[grep("[Nn]ew ", txt_df)][11:20])
# print(txt_df[grep("[Nn]ew ", txt_df)][21:30])
# print(txt_df[grep("[Nn]ew ", txt_df)][31:40])
# print(txt_df[grep("[Nn]ew ", txt_df)][41:50])
# print(txt_df[grep("[Nn]ew ", txt_df)][51:60])
# #print(length(txt_df[grep("[Nn]ew [Zz]ealand", txt_df)]))
#
# dsp_chisq.test(Abstract.contains="[Ww]ill")
# dsp_chisq.test(Abstract.contains="[Tt]ime")
# dsp_chisq.test(Abstract.contains="[Ww]eek")
# dsp_chisq.test(Abstract.contains="[Yy]ear")
# dsp_chisq.test(Abstract.contains="[Ne]w [Yy]ork")
# dsp_chisq.test(Abstract.contains="[Cc]ompan")
# dsp_chisq.test(Abstract.contains="[Oo]ne")
# dsp_chisq.test(Abstract.contains="[Rr]eport")
# dsp_chisq.test(Abstract.contains="[Pp]resid")
#
# dsp_chisq.test(Abstract.contains="[Ss]aid")
# dsp_chisq.test(Abstract.contains="[Cc]an")
# dsp_chisq.test(Abstract.contains="[Dd]ay")
# dsp_chisq.test(Abstract.contains="[Ss]tate")
# dsp_chisq.test(Abstract.contains="[Mm]ake")
# dsp_chisq.test(Abstract.contains="[Bb]ank")
glb_append_stop_words[["Abstract"]] <- c(NULL)
#c("fashion", "first", "intern", "make", "newyork", "report",
# "said", "share", "show", "state", "week", "year")
}
# Standard tm preprocessing pipeline: lowercase, strip punctuation,
# drop (custom + English) stop words, stem.
# NOTE(review): tm_map(corpus, tolower) without content_transformer()
# returns bare character vectors in tm >= 0.6; the PlainTextDocument
# map on the next line appears to be the re-wrapping workaround --
# confirm against the installed tm version.
txt_corpus <- Corpus(VectorSource(txt_df))
txt_corpus <- tm_map(txt_corpus, tolower)
txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
txt_corpus <- tm_map(txt_corpus, removePunctuation)
# txt-corpus <- tm_map(txt_corpus, content_transformer(function(x, pattern) gsub(pattern, "", x))
txt_corpus <- tm_map(txt_corpus, removeWords,
c(glb_append_stop_words[[txt_var]],
stopwords("english")))
txt_corpus <- tm_map(txt_corpus, stemDocument)
# Full DTM over every term, plus per-term corpus frequencies sorted
# descending (freq.full).
full_freqs_DTM <- DocumentTermMatrix(txt_corpus)
print(" Full freqs:"); print(full_freqs_DTM)
full_freqs_vctr <- colSums(as.matrix(full_freqs_DTM))
names(full_freqs_vctr) <- dimnames(full_freqs_DTM)[[2]]
full_freqs_df <- as.data.frame(full_freqs_vctr)
names(full_freqs_df) <- "freq.full"
full_freqs_df$term <- rownames(full_freqs_df)
full_freqs_df <- orderBy(~ -freq.full, full_freqs_df)
# Sparse DTM: keep only terms whose document frequency clears the
# per-variable threshold in glb_sprs_thresholds.
sprs_freqs_DTM <- removeSparseTerms(full_freqs_DTM,
glb_sprs_thresholds[txt_var])
print(" Sparse freqs:"); print(sprs_freqs_DTM)
sprs_freqs_vctr <- colSums(as.matrix(sprs_freqs_DTM))
names(sprs_freqs_vctr) <- dimnames(sprs_freqs_DTM)[[2]]
sprs_freqs_df <- as.data.frame(sprs_freqs_vctr)
names(sprs_freqs_df) <- "freq.sprs"
sprs_freqs_df$term <- rownames(sprs_freqs_df)
sprs_freqs_df <- orderBy(~ -freq.sprs, sprs_freqs_df)
# Diagnostics: ECDF of term frequencies, bar chart of the retained
# (sparse) terms, and the top-10 terms that were dropped.
terms_freqs_df <- merge(full_freqs_df, sprs_freqs_df, all.x=TRUE)
melt_freqs_df <- orderBy(~ -value, melt(terms_freqs_df, id.var="term"))
print(ggplot(melt_freqs_df, aes(value, color=variable)) + stat_ecdf() +
geom_hline(yintercept=glb_sprs_thresholds[txt_var],
linetype = "dotted"))
melt_freqs_df <- orderBy(~ -value,
melt(subset(terms_freqs_df, !is.na(freq.sprs)), id.var="term"))
print(myplot_hbar(melt_freqs_df, "term", "value",
colorcol_name="variable"))
melt_freqs_df <- orderBy(~ -value,
melt(subset(terms_freqs_df, is.na(freq.sprs)), id.var="term"))
print(myplot_hbar(head(melt_freqs_df, 10), "term", "value",
colorcol_name="variable"))
glb_corpus_lst[[txt_var]] <- txt_corpus
glb_full_DTM_lst[[txt_var]] <- full_freqs_DTM
glb_sprs_DTM_lst[[txt_var]] <- sprs_freqs_DTM
}
# Create txt features
# Feature columns are prefixed with the first letter of the text
# variable (H./S./A.); abort if two variables would share a prefix.
if ((length(glb_txt_vars) > 1) &&
(length(unique(pfxs <- sapply(glb_txt_vars,
function(txt) toupper(substr(txt, 1, 1))))) < length(glb_txt_vars)))
stop("Prefixes for corpus freq terms not unique: ", pfxs)
for (txt_var in glb_txt_vars) {
print(sprintf("Binding DTM for %s...", txt_var))
# Bind the sparse DTM's term counts as <P>.<term> columns.
txt_X_df <- as.data.frame(as.matrix(glb_sprs_DTM_lst[[txt_var]]))
colnames(txt_X_df) <- paste(toupper(substr(txt_var, 1, 1)), ".",
make.names(colnames(txt_X_df)), sep="")
rownames(txt_X_df) <- rownames(glb_entity_df) # warning otherwise
glb_entity_df <- cbind(glb_entity_df, txt_X_df)
# Create <txt_var>.has.http
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".has.http", sep="")] <-
sapply(1:nrow(glb_entity_df),
function(row_ix) ifelse(grepl("http", glb_entity_df[row_ix, txt_var], fixed=TRUE),
1, 0))
# Create user-specified term vectors
# UniqueID == 4020, H.has.ebola
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test( Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# Headline-only indicator features: mentions Ebola, is phrased as a
# question (contains "?").
if (txt_var == "Headline") {
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".has.ebola", sep="")] <-
sapply(1:nrow(glb_entity_df),
function(row_ix) ifelse(grepl("[Ee]bola", glb_entity_df[row_ix, txt_var]),
1, 0))
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".is.question", sep="")] <-
sapply(1:nrow(glb_entity_df),
function(row_ix) ifelse(grepl("\\?", glb_entity_df[row_ix, txt_var]),
1, 0))
}
# Create <txt_var>.num.chars
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".num.chars", sep="")] <-
sapply(1:nrow(glb_entity_df),
function(row_ix) nchar(glb_entity_df[row_ix, txt_var]))
# Create <txt_var>.num.words & .num.words.unq
# Word counts come from the FULL DTM (post stop-word removal/stemming):
# row sums = total tokens, nonzero-count row sums = distinct tokens.
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".num.words", sep="")] <-
rowSums(as.matrix(glb_full_DTM_lst[[txt_var]]))
glb_entity_df[, paste(toupper(substr(txt_var, 1, 1)), ".num.words.unq", sep="")] <-
rowSums(as.matrix(glb_full_DTM_lst[[txt_var]]) != 0)
# Keep the log1p-transformed versions as features; exclude the raw
# count columns from the feature set.
for (feat in paste(toupper(substr(txt_var, 1, 1)),
c(".num.chars", ".num.words", ".num.words.unq"), sep="")) {
glb_entity_df[, paste0(feat, ".log")] <- log(1 + glb_entity_df[, feat])
print(myplot_box(glb_entity_df, paste0(feat, ".log"), glb_rsp_var))
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
feat)
}
}
# Generate summaries
# print(summary(glb_entity_df))
# print(sapply(names(glb_entity_df), function(col) sum(is.na(glb_entity_df[, col]))))
# print(summary(glb_trnent_df))
# print(sapply(names(glb_trnent_df), function(col) sum(is.na(glb_trnent_df[, col]))))
# print(summary(glb_newent_df))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
}
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
## [1] "Building corpus for Headline..."
## [1] " Full freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 9205)>>
## Non-/sparse entries: 44361/77296049
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
## [1] " Sparse freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 10)>>
## Non-/sparse entries: 2407/81613
## Sparsity : 97%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Warning: Removed 6 rows containing missing values (geom_path).
## [1] "Building corpus for Snippet..."
## [1] " Full freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 13822)>>
## Non-/sparse entries: 105519/116026925
## Sparsity : 100%
## Maximal term length: 25
## Weighting : term frequency (tf)
## [1] " Sparse freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 22)>>
## Non-/sparse entries: 8657/176187
## Sparsity : 95%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Warning: Removed 6 rows containing missing values (geom_path).
## [1] "Building corpus for Abstract..."
## [1] " Full freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 13866)>>
## Non-/sparse entries: 105900/116396232
## Sparsity : 100%
## Maximal term length: 112
## Weighting : term frequency (tf)
## [1] " Sparse freqs:"
## <<DocumentTermMatrix (documents: 8402, terms: 22)>>
## Non-/sparse entries: 8672/176172
## Sparsity : 95%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Warning: Removed 6 rows containing missing values (geom_path).
## [1] "Binding DTM for Headline..."
## [1] "Binding DTM for Snippet..."
## [1] "Binding DTM for Abstract..."
# Re-partition
# Split the combined entity frame back into training and new (test) sets
# now that all engineered features have been added to glb_entity_df.
glb_trnent_df <- subset(glb_entity_df, .src == "Train")
glb_newent_df <- subset(glb_entity_df, .src == "Test")
# print(sapply(names(glb_trnent_df), function(col) sum(is.na(glb_trnent_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
# print(myplot_scatter(glb_trnent_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# Record progress in the analysis petri net: both datasets are now ready.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all","data.new")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "select.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 62.116 154.693 92.578
## 7 select.features 4 0 154.694 NA NA
4.0: select features

print(glb_feats_df <- myselect_features(entity_df=glb_trnent_df,
exclude_vars_as_features=glb_exclude_vars_as_features,
rsp_var=glb_rsp_var))
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y exclude.as.feat
## Popular Popular 1.000000000 1
## WordCount.log WordCount.log 0.265952699 0
## WordCount WordCount 0.257526549 1
## S.num.words.unq.log S.num.words.unq.log -0.250796919 0
## A.num.words.unq.log A.num.words.unq.log -0.250601203 0
## S.num.words.log S.num.words.log -0.245354135 0
## A.num.words.log A.num.words.log -0.245073324 0
## S.num.chars.log S.num.chars.log -0.224692967 0
## A.num.chars.log A.num.chars.log -0.224548821 0
## SubsectionName.nb.fctr SubsectionName.nb.fctr -0.213860009 0
## S.num.words.unq S.num.words.unq -0.212102717 1
## A.num.words.unq A.num.words.unq -0.210242145 1
## S.num.words S.num.words -0.206385049 1
## H.num.words.unq.log H.num.words.unq.log -0.204496360 0
## A.num.words A.num.words -0.204211072 1
## H.num.words.log H.num.words.log -0.200686356 0
## H.num.words.unq H.num.words.unq -0.189702157 1
## H.num.words H.num.words -0.186036895 1
## S.num.chars S.num.chars -0.179331806 1
## A.num.chars A.num.chars -0.177037425 1
## NewsDesk.nb.fctr NewsDesk.nb.fctr -0.172482671 0
## H.num.chars.log H.num.chars.log -0.171062360 0
## PubDate.hour PubDate.hour 0.159167673 0
## SectionName.nb.fctr SectionName.nb.fctr -0.148701209 0
## H.num.chars H.num.chars -0.147211183 1
## H.is.question H.is.question 0.129154799 0
## PubDate.apm.fctr PubDate.apm.fctr 0.101472715 0
## Headline.pfx.fctr Headline.pfx.fctr -0.100052879 0
## S.fashion S.fashion -0.086446251 0
## A.fashion A.fashion -0.086446251 0
## S.week S.week -0.084814939 0
## A.week A.week -0.084814939 0
## H.fashion H.fashion -0.081708612 0
## H.week H.week -0.075105216 0
## H.daili H.daili -0.069192975 0
## S.intern S.intern -0.068485701 0
## A.intern A.intern -0.068485701 0
## H.X2015 H.X2015 -0.066584892 0
## H.report H.report -0.064948102 0
## H.today H.today -0.063723058 0
## S.newyork S.newyork -0.062117105 0
## A.newyork A.newyork -0.062117105 0
## H.day H.day -0.061669687 0
## A.will A.will -0.061025004 0
## S.will S.will -0.060575493 0
## S.articl S.articl -0.059520554 0
## A.articl A.articl -0.059520554 0
## H.newyork H.newyork -0.057970095 0
## A.time A.time -0.057790617 0
## S.time S.time -0.057595102 0
## S.first S.first -0.053388178 0
## A.first A.first -0.053388178 0
## H.new H.new -0.053121542 0
## A.compani A.compani -0.053099633 0
## S.compani S.compani -0.053012962 0
## S.year S.year -0.051146178 0
## A.year A.year -0.051146178 0
## S.share S.share -0.050329686 0
## A.share A.share -0.050329686 0
## S.report S.report -0.050211524 0
## A.report A.report -0.050211524 0
## S.show S.show -0.048801740 0
## A.show A.show -0.048801740 0
## H.X2014 H.X2014 -0.046206380 0
## A.day A.day -0.045909684 0
## S.day S.day -0.045649185 0
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288 0
## A.new A.new -0.035359447 0
## S.new S.new -0.034948520 0
## A.can A.can 0.031498867 0
## PubDate.minute PubDate.minute -0.031469083 0
## S.can S.can 0.029999780 0
## A.take A.take -0.026086108 0
## H.has.ebola H.has.ebola 0.025881397 0
## S.take S.take -0.025762398 0
## S.make S.make 0.023138853 0
## A.make A.make 0.023138853 0
## S.presid S.presid -0.019828826 0
## A.presid A.presid -0.019828826 0
## PubDate.month.fctr PubDate.month.fctr 0.019148739 1
## A.has.http A.has.http -0.013592603 0
## PubDate.second PubDate.second -0.012253600 0
## UniqueID UniqueID 0.011824920 1
## PubDate.date.fctr PubDate.date.fctr -0.011647558 0
## .rnorm .rnorm -0.008703337 0
## S.one S.one 0.006342094 0
## S.state S.state 0.006069626 0
## A.state A.state 0.005702163 0
## A.one A.one 0.005696039 0
## S.said S.said 0.001363226 0
## A.said A.said 0.001363226 0
## PubDate.year PubDate.year NA 1
## H.has.http H.has.http NA 0
## S.has.http S.has.http NA 0
## cor.y.abs
## Popular 1.000000000
## WordCount.log 0.265952699
## WordCount 0.257526549
## S.num.words.unq.log 0.250796919
## A.num.words.unq.log 0.250601203
## S.num.words.log 0.245354135
## A.num.words.log 0.245073324
## S.num.chars.log 0.224692967
## A.num.chars.log 0.224548821
## SubsectionName.nb.fctr 0.213860009
## S.num.words.unq 0.212102717
## A.num.words.unq 0.210242145
## S.num.words 0.206385049
## H.num.words.unq.log 0.204496360
## A.num.words 0.204211072
## H.num.words.log 0.200686356
## H.num.words.unq 0.189702157
## H.num.words 0.186036895
## S.num.chars 0.179331806
## A.num.chars 0.177037425
## NewsDesk.nb.fctr 0.172482671
## H.num.chars.log 0.171062360
## PubDate.hour 0.159167673
## SectionName.nb.fctr 0.148701209
## H.num.chars 0.147211183
## H.is.question 0.129154799
## PubDate.apm.fctr 0.101472715
## Headline.pfx.fctr 0.100052879
## S.fashion 0.086446251
## A.fashion 0.086446251
## S.week 0.084814939
## A.week 0.084814939
## H.fashion 0.081708612
## H.week 0.075105216
## H.daili 0.069192975
## S.intern 0.068485701
## A.intern 0.068485701
## H.X2015 0.066584892
## H.report 0.064948102
## H.today 0.063723058
## S.newyork 0.062117105
## A.newyork 0.062117105
## H.day 0.061669687
## A.will 0.061025004
## S.will 0.060575493
## S.articl 0.059520554
## A.articl 0.059520554
## H.newyork 0.057970095
## A.time 0.057790617
## S.time 0.057595102
## S.first 0.053388178
## A.first 0.053388178
## H.new 0.053121542
## A.compani 0.053099633
## S.compani 0.053012962
## S.year 0.051146178
## A.year 0.051146178
## S.share 0.050329686
## A.share 0.050329686
## S.report 0.050211524
## A.report 0.050211524
## S.show 0.048801740
## A.show 0.048801740
## H.X2014 0.046206380
## A.day 0.045909684
## S.day 0.045649185
## PubDate.wkday.fctr 0.039801288
## A.new 0.035359447
## S.new 0.034948520
## A.can 0.031498867
## PubDate.minute 0.031469083
## S.can 0.029999780
## A.take 0.026086108
## H.has.ebola 0.025881397
## S.take 0.025762398
## S.make 0.023138853
## A.make 0.023138853
## S.presid 0.019828826
## A.presid 0.019828826
## PubDate.month.fctr 0.019148739
## A.has.http 0.013592603
## PubDate.second 0.012253600
## UniqueID 0.011824920
## PubDate.date.fctr 0.011647558
## .rnorm 0.008703337
## S.one 0.006342094
## S.state 0.006069626
## A.state 0.005702163
## A.one 0.005696039
## S.said 0.001363226
## A.said 0.001363226
## PubDate.year NA
## H.has.http NA
## S.has.http NA
# Re-rank the feature metadata by correlation with the response and flag
# highly-correlated feature pairs (myfind_cor_features emits the
# "Identified <x> as highly correlated with <y>" warnings seen above).
# For binomial classification, also check each feature is conditional on y.
glb_feats_df <- myfind_cor_features(feats_df=glb_feats_df,
                                    entity_df=glb_trnent_df,
                                    rsp_var=glb_rsp_var,
                                    checkConditionalX=(glb_is_classification &&
                                                       glb_is_binomial))
glb_feats_df <- orderBy(~-cor.y, glb_feats_df)
print(glb_feats_df)
## Loading required package: caret
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
## [1] "cor(A.articl, S.articl)=1.0000"
## [1] "cor(Popular.fctr, A.articl)=-0.0595"
## [1] "cor(Popular.fctr, S.articl)=-0.0595"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.articl as highly correlated with A.articl
## [1] "cor(A.fashion, S.fashion)=1.0000"
## [1] "cor(Popular.fctr, A.fashion)=-0.0864"
## [1] "cor(Popular.fctr, S.fashion)=-0.0864"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.fashion as highly correlated with A.fashion
## [1] "cor(A.first, S.first)=1.0000"
## [1] "cor(Popular.fctr, A.first)=-0.0534"
## [1] "cor(Popular.fctr, S.first)=-0.0534"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.first as highly correlated with A.first
## [1] "cor(A.intern, S.intern)=1.0000"
## [1] "cor(Popular.fctr, A.intern)=-0.0685"
## [1] "cor(Popular.fctr, S.intern)=-0.0685"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.intern as highly correlated with A.intern
## [1] "cor(A.make, S.make)=1.0000"
## [1] "cor(Popular.fctr, A.make)=0.0231"
## [1] "cor(Popular.fctr, S.make)=0.0231"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.make as highly correlated with A.make
## [1] "cor(A.newyork, S.newyork)=1.0000"
## [1] "cor(Popular.fctr, A.newyork)=-0.0621"
## [1] "cor(Popular.fctr, S.newyork)=-0.0621"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.newyork as highly correlated with A.newyork
## [1] "cor(A.presid, S.presid)=1.0000"
## [1] "cor(Popular.fctr, A.presid)=-0.0198"
## [1] "cor(Popular.fctr, S.presid)=-0.0198"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.presid as highly correlated with A.presid
## [1] "cor(A.report, S.report)=1.0000"
## [1] "cor(Popular.fctr, A.report)=-0.0502"
## [1] "cor(Popular.fctr, S.report)=-0.0502"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.report as highly correlated with A.report
## [1] "cor(A.share, S.share)=1.0000"
## [1] "cor(Popular.fctr, A.share)=-0.0503"
## [1] "cor(Popular.fctr, S.share)=-0.0503"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.share as highly correlated with A.share
## [1] "cor(A.show, S.show)=1.0000"
## [1] "cor(Popular.fctr, A.show)=-0.0488"
## [1] "cor(Popular.fctr, S.show)=-0.0488"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.show as highly correlated with A.show
## [1] "cor(A.week, S.week)=1.0000"
## [1] "cor(Popular.fctr, A.week)=-0.0848"
## [1] "cor(Popular.fctr, S.week)=-0.0848"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.week as highly correlated with A.week
## [1] "cor(A.year, S.year)=1.0000"
## [1] "cor(Popular.fctr, A.year)=-0.0511"
## [1] "cor(Popular.fctr, S.year)=-0.0511"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.year as highly correlated with A.year
## [1] "cor(A.time, S.time)=0.9991"
## [1] "cor(Popular.fctr, A.time)=-0.0578"
## [1] "cor(Popular.fctr, S.time)=-0.0576"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.time as highly correlated with A.time
## [1] "cor(A.num.words.unq.log, S.num.words.unq.log)=0.9989"
## [1] "cor(Popular.fctr, A.num.words.unq.log)=-0.2506"
## [1] "cor(Popular.fctr, S.num.words.unq.log)=-0.2508"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified A.num.words.unq.log as highly correlated with
## S.num.words.unq.log
## [1] "cor(A.num.words.log, S.num.words.log)=0.9988"
## [1] "cor(Popular.fctr, A.num.words.log)=-0.2451"
## [1] "cor(Popular.fctr, S.num.words.log)=-0.2454"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified A.num.words.log as highly correlated with
## S.num.words.log
## [1] "cor(A.compani, S.compani)=0.9988"
## [1] "cor(Popular.fctr, A.compani)=-0.0531"
## [1] "cor(Popular.fctr, S.compani)=-0.0530"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.compani as highly correlated with A.compani
## [1] "cor(A.num.chars.log, S.num.chars.log)=0.9986"
## [1] "cor(Popular.fctr, A.num.chars.log)=-0.2245"
## [1] "cor(Popular.fctr, S.num.chars.log)=-0.2247"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified A.num.chars.log as highly correlated with
## S.num.chars.log
## [1] "cor(A.new, S.new)=0.9983"
## [1] "cor(Popular.fctr, A.new)=-0.0354"
## [1] "cor(Popular.fctr, S.new)=-0.0349"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.new as highly correlated with A.new
## [1] "cor(A.can, S.can)=0.9982"
## [1] "cor(Popular.fctr, A.can)=0.0315"
## [1] "cor(Popular.fctr, S.can)=0.0300"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.can as highly correlated with A.can
## [1] "cor(A.day, S.day)=0.9981"
## [1] "cor(Popular.fctr, A.day)=-0.0459"
## [1] "cor(Popular.fctr, S.day)=-0.0456"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.day as highly correlated with A.day
## [1] "cor(A.take, S.take)=0.9976"
## [1] "cor(Popular.fctr, A.take)=-0.0261"
## [1] "cor(Popular.fctr, S.take)=-0.0258"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.take as highly correlated with A.take
## [1] "cor(A.will, S.will)=0.9976"
## [1] "cor(Popular.fctr, A.will)=-0.0610"
## [1] "cor(Popular.fctr, S.will)=-0.0606"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.will as highly correlated with A.will
## [1] "cor(H.num.words.log, H.num.words.unq.log)=0.9967"
## [1] "cor(Popular.fctr, H.num.words.log)=-0.2007"
## [1] "cor(Popular.fctr, H.num.words.unq.log)=-0.2045"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified H.num.words.log as highly correlated with
## H.num.words.unq.log
## [1] "cor(S.num.words.log, S.num.words.unq.log)=0.9954"
## [1] "cor(Popular.fctr, S.num.words.log)=-0.2454"
## [1] "cor(Popular.fctr, S.num.words.unq.log)=-0.2508"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.num.words.log as highly correlated with
## S.num.words.unq.log
## [1] "cor(S.num.chars.log, S.num.words.unq.log)=0.9543"
## [1] "cor(Popular.fctr, S.num.chars.log)=-0.2247"
## [1] "cor(Popular.fctr, S.num.words.unq.log)=-0.2508"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified S.num.chars.log as highly correlated with
## S.num.words.unq.log
## [1] "cor(NewsDesk.nb.fctr, SectionName.nb.fctr)=0.9159"
## [1] "cor(Popular.fctr, NewsDesk.nb.fctr)=-0.1725"
## [1] "cor(Popular.fctr, SectionName.nb.fctr)=-0.1487"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified SectionName.nb.fctr as highly correlated with
## NewsDesk.nb.fctr
## [1] "cor(H.num.chars.log, H.num.words.unq.log)=0.8881"
## [1] "cor(Popular.fctr, H.num.chars.log)=-0.1711"
## [1] "cor(Popular.fctr, H.num.words.unq.log)=-0.2045"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified H.num.chars.log as highly correlated with
## H.num.words.unq.log
## [1] "cor(NewsDesk.nb.fctr, SubsectionName.nb.fctr)=0.8492"
## [1] "cor(Popular.fctr, NewsDesk.nb.fctr)=-0.1725"
## [1] "cor(Popular.fctr, SubsectionName.nb.fctr)=-0.2139"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified NewsDesk.nb.fctr as highly correlated with
## SubsectionName.nb.fctr
## [1] "cor(PubDate.apm.fctr, PubDate.hour)=0.8156"
## [1] "cor(Popular.fctr, PubDate.apm.fctr)=0.1015"
## [1] "cor(Popular.fctr, PubDate.hour)=0.1592"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified PubDate.apm.fctr as highly correlated with
## PubDate.hour
## [1] "cor(H.fashion, H.week)=0.7616"
## [1] "cor(Popular.fctr, H.fashion)=-0.0817"
## [1] "cor(Popular.fctr, H.week)=-0.0751"
## Warning in myfind_cor_features(feats_df = glb_feats_df, entity_df =
## glb_trnent_df, : Identified H.week as highly correlated with H.fashion
## id cor.y exclude.as.feat
## Popular Popular 1.000000000 1
## WordCount.log WordCount.log 0.265952699 0
## WordCount WordCount 0.257526549 1
## PubDate.hour PubDate.hour 0.159167673 0
## H.is.question H.is.question 0.129154799 0
## PubDate.apm.fctr PubDate.apm.fctr 0.101472715 0
## A.can A.can 0.031498867 0
## S.can S.can 0.029999780 0
## H.has.ebola H.has.ebola 0.025881397 0
## S.make S.make 0.023138853 0
## A.make A.make 0.023138853 0
## PubDate.month.fctr PubDate.month.fctr 0.019148739 1
## UniqueID UniqueID 0.011824920 1
## S.one S.one 0.006342094 0
## S.state S.state 0.006069626 0
## A.state A.state 0.005702163 0
## A.one A.one 0.005696039 0
## S.said S.said 0.001363226 0
## A.said A.said 0.001363226 0
## .rnorm .rnorm -0.008703337 0
## PubDate.date.fctr PubDate.date.fctr -0.011647558 0
## PubDate.second PubDate.second -0.012253600 0
## A.has.http A.has.http -0.013592603 0
## S.presid S.presid -0.019828826 0
## A.presid A.presid -0.019828826 0
## S.take S.take -0.025762398 0
## A.take A.take -0.026086108 0
## PubDate.minute PubDate.minute -0.031469083 0
## S.new S.new -0.034948520 0
## A.new A.new -0.035359447 0
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288 0
## S.day S.day -0.045649185 0
## A.day A.day -0.045909684 0
## H.X2014 H.X2014 -0.046206380 0
## S.show S.show -0.048801740 0
## A.show A.show -0.048801740 0
## S.report S.report -0.050211524 0
## A.report A.report -0.050211524 0
## S.share S.share -0.050329686 0
## A.share A.share -0.050329686 0
## S.year S.year -0.051146178 0
## A.year A.year -0.051146178 0
## S.compani S.compani -0.053012962 0
## A.compani A.compani -0.053099633 0
## H.new H.new -0.053121542 0
## S.first S.first -0.053388178 0
## A.first A.first -0.053388178 0
## S.time S.time -0.057595102 0
## A.time A.time -0.057790617 0
## H.newyork H.newyork -0.057970095 0
## S.articl S.articl -0.059520554 0
## A.articl A.articl -0.059520554 0
## S.will S.will -0.060575493 0
## A.will A.will -0.061025004 0
## H.day H.day -0.061669687 0
## S.newyork S.newyork -0.062117105 0
## A.newyork A.newyork -0.062117105 0
## H.today H.today -0.063723058 0
## H.report H.report -0.064948102 0
## H.X2015 H.X2015 -0.066584892 0
## S.intern S.intern -0.068485701 0
## A.intern A.intern -0.068485701 0
## H.daili H.daili -0.069192975 0
## H.week H.week -0.075105216 0
## H.fashion H.fashion -0.081708612 0
## S.week S.week -0.084814939 0
## A.week A.week -0.084814939 0
## S.fashion S.fashion -0.086446251 0
## A.fashion A.fashion -0.086446251 0
## Headline.pfx.fctr Headline.pfx.fctr -0.100052879 0
## H.num.chars H.num.chars -0.147211183 1
## SectionName.nb.fctr SectionName.nb.fctr -0.148701209 0
## H.num.chars.log H.num.chars.log -0.171062360 0
## NewsDesk.nb.fctr NewsDesk.nb.fctr -0.172482671 0
## A.num.chars A.num.chars -0.177037425 1
## S.num.chars S.num.chars -0.179331806 1
## H.num.words H.num.words -0.186036895 1
## H.num.words.unq H.num.words.unq -0.189702157 1
## H.num.words.log H.num.words.log -0.200686356 0
## A.num.words A.num.words -0.204211072 1
## H.num.words.unq.log H.num.words.unq.log -0.204496360 0
## S.num.words S.num.words -0.206385049 1
## A.num.words.unq A.num.words.unq -0.210242145 1
## S.num.words.unq S.num.words.unq -0.212102717 1
## SubsectionName.nb.fctr SubsectionName.nb.fctr -0.213860009 0
## A.num.chars.log A.num.chars.log -0.224548821 0
## S.num.chars.log S.num.chars.log -0.224692967 0
## A.num.words.log A.num.words.log -0.245073324 0
## S.num.words.log S.num.words.log -0.245354135 0
## A.num.words.unq.log A.num.words.unq.log -0.250601203 0
## S.num.words.unq.log S.num.words.unq.log -0.250796919 0
## PubDate.year PubDate.year NA 1
## H.has.http H.has.http NA 0
## S.has.http S.has.http NA 0
## cor.y.abs cor.high.X is.ConditionalX.y
## Popular 1.000000000 <NA> NA
## WordCount.log 0.265952699 <NA> TRUE
## WordCount 0.257526549 <NA> NA
## PubDate.hour 0.159167673 PubDate.apm.fctr TRUE
## H.is.question 0.129154799 <NA> TRUE
## PubDate.apm.fctr 0.101472715 <NA> TRUE
## A.can 0.031498867 S.can TRUE
## S.can 0.029999780 <NA> TRUE
## H.has.ebola 0.025881397 <NA> TRUE
## S.make 0.023138853 <NA> TRUE
## A.make 0.023138853 S.make TRUE
## PubDate.month.fctr 0.019148739 <NA> NA
## UniqueID 0.011824920 <NA> NA
## S.one 0.006342094 <NA> TRUE
## S.state 0.006069626 <NA> TRUE
## A.state 0.005702163 <NA> TRUE
## A.one 0.005696039 <NA> TRUE
## S.said 0.001363226 <NA> TRUE
## A.said 0.001363226 <NA> TRUE
## .rnorm 0.008703337 <NA> TRUE
## PubDate.date.fctr 0.011647558 <NA> TRUE
## PubDate.second 0.012253600 <NA> TRUE
## A.has.http 0.013592603 <NA> FALSE
## S.presid 0.019828826 <NA> TRUE
## A.presid 0.019828826 S.presid TRUE
## S.take 0.025762398 <NA> TRUE
## A.take 0.026086108 S.take TRUE
## PubDate.minute 0.031469083 <NA> TRUE
## S.new 0.034948520 <NA> TRUE
## A.new 0.035359447 S.new TRUE
## PubDate.wkday.fctr 0.039801288 <NA> TRUE
## S.day 0.045649185 <NA> TRUE
## A.day 0.045909684 S.day TRUE
## H.X2014 0.046206380 <NA> TRUE
## S.show 0.048801740 <NA> TRUE
## A.show 0.048801740 S.show TRUE
## S.report 0.050211524 <NA> TRUE
## A.report 0.050211524 S.report TRUE
## S.share 0.050329686 <NA> TRUE
## A.share 0.050329686 S.share TRUE
## S.year 0.051146178 <NA> TRUE
## A.year 0.051146178 S.year TRUE
## S.compani 0.053012962 <NA> TRUE
## A.compani 0.053099633 S.compani TRUE
## H.new 0.053121542 <NA> TRUE
## S.first 0.053388178 <NA> TRUE
## A.first 0.053388178 S.first TRUE
## S.time 0.057595102 <NA> TRUE
## A.time 0.057790617 S.time TRUE
## H.newyork 0.057970095 <NA> TRUE
## S.articl 0.059520554 <NA> TRUE
## A.articl 0.059520554 S.articl TRUE
## S.will 0.060575493 <NA> TRUE
## A.will 0.061025004 S.will TRUE
## H.day 0.061669687 <NA> TRUE
## S.newyork 0.062117105 <NA> TRUE
## A.newyork 0.062117105 S.newyork TRUE
## H.today 0.063723058 <NA> TRUE
## H.report 0.064948102 <NA> TRUE
## H.X2015 0.066584892 <NA> FALSE
## S.intern 0.068485701 <NA> TRUE
## A.intern 0.068485701 S.intern TRUE
## H.daili 0.069192975 <NA> FALSE
## H.week 0.075105216 <NA> TRUE
## H.fashion 0.081708612 H.week TRUE
## S.week 0.084814939 <NA> TRUE
## A.week 0.084814939 S.week TRUE
## S.fashion 0.086446251 <NA> TRUE
## A.fashion 0.086446251 S.fashion TRUE
## Headline.pfx.fctr 0.100052879 <NA> TRUE
## H.num.chars 0.147211183 <NA> NA
## SectionName.nb.fctr 0.148701209 <NA> TRUE
## H.num.chars.log 0.171062360 <NA> TRUE
## NewsDesk.nb.fctr 0.172482671 SectionName.nb.fctr TRUE
## A.num.chars 0.177037425 <NA> NA
## S.num.chars 0.179331806 <NA> NA
## H.num.words 0.186036895 <NA> NA
## H.num.words.unq 0.189702157 <NA> NA
## H.num.words.log 0.200686356 <NA> TRUE
## A.num.words 0.204211072 <NA> NA
## H.num.words.unq.log 0.204496360 H.num.chars.log TRUE
## S.num.words 0.206385049 <NA> NA
## A.num.words.unq 0.210242145 <NA> NA
## S.num.words.unq 0.212102717 <NA> NA
## SubsectionName.nb.fctr 0.213860009 NewsDesk.nb.fctr TRUE
## A.num.chars.log 0.224548821 <NA> TRUE
## S.num.chars.log 0.224692967 A.num.chars.log TRUE
## A.num.words.log 0.245073324 <NA> TRUE
## S.num.words.log 0.245354135 A.num.words.log TRUE
## A.num.words.unq.log 0.250601203 <NA> TRUE
## S.num.words.unq.log 0.250796919 S.num.chars.log TRUE
## PubDate.year NA <NA> NA
## H.has.http NA <NA> FALSE
## S.has.http NA <NA> FALSE
## is.cor.y.abs.low
## Popular FALSE
## WordCount.log FALSE
## WordCount FALSE
## PubDate.hour FALSE
## H.is.question FALSE
## PubDate.apm.fctr FALSE
## A.can FALSE
## S.can FALSE
## H.has.ebola FALSE
## S.make FALSE
## A.make FALSE
## PubDate.month.fctr FALSE
## UniqueID FALSE
## S.one TRUE
## S.state TRUE
## A.state TRUE
## A.one TRUE
## S.said TRUE
## A.said TRUE
## .rnorm FALSE
## PubDate.date.fctr FALSE
## PubDate.second FALSE
## A.has.http FALSE
## S.presid FALSE
## A.presid FALSE
## S.take FALSE
## A.take FALSE
## PubDate.minute FALSE
## S.new FALSE
## A.new FALSE
## PubDate.wkday.fctr FALSE
## S.day FALSE
## A.day FALSE
## H.X2014 FALSE
## S.show FALSE
## A.show FALSE
## S.report FALSE
## A.report FALSE
## S.share FALSE
## A.share FALSE
## S.year FALSE
## A.year FALSE
## S.compani FALSE
## A.compani FALSE
## H.new FALSE
## S.first FALSE
## A.first FALSE
## S.time FALSE
## A.time FALSE
## H.newyork FALSE
## S.articl FALSE
## A.articl FALSE
## S.will FALSE
## A.will FALSE
## H.day FALSE
## S.newyork FALSE
## A.newyork FALSE
## H.today FALSE
## H.report FALSE
## H.X2015 FALSE
## S.intern FALSE
## A.intern FALSE
## H.daili FALSE
## H.week FALSE
## H.fashion FALSE
## S.week FALSE
## A.week FALSE
## S.fashion FALSE
## A.fashion FALSE
## Headline.pfx.fctr FALSE
## H.num.chars FALSE
## SectionName.nb.fctr FALSE
## H.num.chars.log FALSE
## NewsDesk.nb.fctr FALSE
## A.num.chars FALSE
## S.num.chars FALSE
## H.num.words FALSE
## H.num.words.unq FALSE
## H.num.words.log FALSE
## A.num.words FALSE
## H.num.words.unq.log FALSE
## S.num.words FALSE
## A.num.words.unq FALSE
## S.num.words.unq FALSE
## SubsectionName.nb.fctr FALSE
## A.num.chars.log FALSE
## S.num.chars.log FALSE
## A.num.words.log FALSE
## S.num.words.log FALSE
## A.num.words.unq.log FALSE
## S.num.words.unq.log FALSE
## PubDate.year NA
## H.has.http NA
## S.has.http NA
glb_chunks_df <- myadd_chunk(glb_chunks_df, "partition.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 7 select.features 4 0 154.694 165.031 10.337
## 8 partition.data.training 5 0 165.032 NA NA
# 5.0: partition data training
# (The heading above was fused onto the `if` line during rendering, which
# made the line unparseable R; it is restored here as a comment.)
#
# If the new/test data carries no labels (all response values NA), carve an
# out-of-bag (OOB) validation set out of the training data via a stratified
# sample on the raw response; otherwise use the labeled new data as OOB.
if (all(is.na(glb_newent_df[, glb_rsp_var]))) {
    require(caTools)
    set.seed(glb_split_sample.seed)
    # SplitRatio is chosen so the OOB set is ~10% larger than the new
    # dataset, mimicking its size while keeping most rows for fitting
    split <- sample.split(glb_trnent_df[, glb_rsp_var_raw],
        SplitRatio=1 - (nrow(glb_newent_df) * 1.1 / nrow(glb_trnent_df)))
    glb_fitent_df <- glb_trnent_df[split, ]
    glb_OOBent_df <- glb_trnent_df[!split, ]
} else {
    print(sprintf("Newdata contains non-NA data for %s; setting OOB to Newdata",
                  glb_rsp_var))
    glb_fitent_df <- glb_trnent_df
    glb_OOBent_df <- glb_newent_df
}
## Loading required package: caTools
# Optionally cap the fit set at glb_max_fitent_obs rows via a stratified
# sample on the raw response (no-op here since glb_max_fitent_obs is NULL).
if (!is.null(glb_max_fitent_obs) && (nrow(glb_fitent_df) > glb_max_fitent_obs)) {
    warning("glb_fitent_df restricted to glb_max_fitent_obs: ",
            format(glb_max_fitent_obs, big.mark=","))
    org_fitent_df <- glb_fitent_df
    # sample.split accepts an absolute count as SplitRatio when > 1
    split <- sample.split(org_fitent_df[, glb_rsp_var_raw],
                          SplitRatio=glb_max_fitent_obs)
    glb_fitent_df <- org_fitent_df[split, ]
    org_fitent_df <- NULL  # release the snapshot
}
# Tag each observation in the master entity data frame with its partition
# location: "Fit" (model fitting), "OOB" (out-of-bag validation), or ""
# (new/test rows, which appear in neither). Matching is by the id column(s)
# in glb_id_vars. A snapshot is kept first for interactive recovery.
sav_entity_df <- glb_entity_df
glb_entity_df$.lcn <- ""
glb_entity_df[glb_entity_df[, glb_id_vars] %in%
glb_fitent_df[, glb_id_vars], ".lcn"] <- "Fit"
glb_entity_df[glb_entity_df[, glb_id_vars] %in%
glb_OOBent_df[, glb_id_vars], ".lcn"] <- "OOB"
# Display the class distribution of partition_var crossed with location_var
# (e.g. Popular counts per Fit/OOB partition), as raw counts followed by
# row-wise proportions.
#
# Args:
#   obs_df:        data frame of observations containing both columns
#   location_var:  column identifying the partition location (e.g. ".lcn")
#   partition_var: column whose class distribution is displayed
#
# Side effects: prints two tables; relies on mycreate_xtab_df (mydsutils.R).
dsp_class_dstrb <- function(obs_df, location_var, partition_var) {
    xtab_df <- mycreate_xtab_df(obs_df, c(location_var, partition_var))
    rownames(xtab_df) <- xtab_df[, location_var]
    # Bug fix: the original `xtab_df[, -grepl(location_var, names(xtab_df))]`
    # coerced the logical match vector to c(-1, 0, ...); with zero matches
    # that index is all zeros and selects NO columns. Negate logically
    # instead, and keep drop=FALSE so a single surviving column stays a
    # data frame (rowSums() below requires >= 2 dimensions).
    xtab_df <- xtab_df[, !grepl(location_var, names(xtab_df)), drop=FALSE]
    print(xtab_df)
    print(xtab_df / rowSums(xtab_df, na.rm=TRUE))
}
# Ensure proper splits by glb_rsp_var_raw & user-specified feature for OOB vs. new
# Verify the Fit/OOB split preserved the raw response's class balance
# (printed below: both partitions show ~83% / ~17% Popular 0/1).
dsp_class_dstrb(glb_entity_df, ".lcn", glb_rsp_var_raw)
## Popular.0 Popular.1 Popular.NA
## NA NA 1870
## Fit 3726 749 NA
## OOB 1713 344 NA
## Popular.0 Popular.1 Popular.NA
## NA NA 1
## Fit 0.8326257 0.1673743 NA
## OOB 0.8327662 0.1672338 NA
# Compare NewsDesk.nb category frequencies between the new/Test data and
# the OOB partition — the distributions should be similar for OOB scores
# to be a trustworthy proxy for test performance.
newent_ctgry_df <- mycreate_sqlxtab_df(subset(glb_entity_df, .src == "Test"),
"NewsDesk.nb")
OOBent_ctgry_df <- mycreate_sqlxtab_df(subset(glb_entity_df, .lcn == "OOB"),
"NewsDesk.nb")
# Outer join so categories present in only one partition are kept (as NA)
glb_ctgry_df <- merge(newent_ctgry_df, OOBent_ctgry_df, by="NewsDesk.nb", all=TRUE,
suffixes=c(".Tst", ".OOB"))
glb_ctgry_df$.freqRatio.Tst <- glb_ctgry_df$.n.Tst / sum(glb_ctgry_df$.n.Tst, na.rm=TRUE)
glb_ctgry_df$.freqRatio.OOB <- glb_ctgry_df$.n.OOB / sum(glb_ctgry_df$.n.OOB, na.rm=TRUE)
print(orderBy(~-.freqRatio.Tst-.freqRatio.OOB, glb_ctgry_df))
## NewsDesk.nb .n.Tst .n.OOB .freqRatio.Tst .freqRatio.OOB
## 1 Business 500 516 0.267379679 0.2508507535
## 9 myMisc:: 247 297 0.132085561 0.1443850267
## 2 Culture 243 203 0.129946524 0.0986874088
## 11 OpEd 205 217 0.109625668 0.1054934370
## 20 TStyle 107 239 0.057219251 0.1161886242
## 5 Foreign 107 121 0.057219251 0.0588235294
## 8 myEducation 93 118 0.049732620 0.0573650948
## 16 Styles 76 75 0.040641711 0.0364608653
## 7 Metro 66 57 0.035294118 0.0277102577
## 14 Science 57 66 0.030481283 0.0320855615
## 10 myMultimedia 53 38 0.028342246 0.0184735051
## 19 Travel 31 34 0.016577540 0.0165289256
## 3 Daily Clip Report:: 22 18 0.011764706 0.0087506077
## 18 Today in Politics:: 21 14 0.011229947 0.0068060282
## 4 First Draft:: 14 18 0.007486631 0.0087506077
## 21 Verbatim:: 12 11 0.006417112 0.0053475936
## 13 Reporter's Notebook:: 7 1 0.003743316 0.0004861449
## 12 Readers Respond:: 4 2 0.002139037 0.0009722897
## 6 Magazine 3 10 0.001604278 0.0048614487
## 17 The Daily Gift:: 2 1 0.001069519 0.0004861449
## 15 Sports NA 1 NA 0.0004861449
# dsp_class_dstrb(glb_entity_df, ".src", "NewsDesk.nb")
# dsp_class_dstrb(glb_entity_df, ".lcn", "NewsDesk.nb")
# Run this line by line
# Annotate glb_feats_df with bookkeeping flags: mark the raw response row
# (rsp_var_raw), the id variable row(s) (id_var), and append a row for the
# factor response (rsp_var), which is excluded as a feature by definition.
print("glb_feats_df:"); print(dim(glb_feats_df))
## [1] "glb_feats_df:"
## [1] 94 7
# Snapshot/restore idiom for interactive re-runs of this section
sav_feats_df <- glb_feats_df
glb_feats_df <- sav_feats_df
glb_feats_df[, "rsp_var_raw"] <- FALSE
glb_feats_df[glb_feats_df$id == glb_rsp_var_raw, "rsp_var_raw"] <- TRUE
# Normalize exclude.as.feat from numeric 0/1 to logical
glb_feats_df$exclude.as.feat <- (glb_feats_df$exclude.as.feat == 1)
if (!is.null(glb_id_vars) && glb_id_vars != ".rownames")
glb_feats_df[glb_feats_df$id %in% glb_id_vars, "id_var"] <- TRUE
add_feats_df <- data.frame(id=glb_rsp_var, exclude.as.feat=TRUE, rsp_var=TRUE)
row.names(add_feats_df) <- add_feats_df$id; print(add_feats_df)
## id exclude.as.feat rsp_var
## Popular.fctr Popular.fctr TRUE TRUE
glb_feats_df <- myrbind_df(glb_feats_df, add_feats_df)
print(subset(glb_feats_df, rsp_var_raw | rsp_var | id_var))
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## Popular Popular 1.00000000 TRUE 1.00000000 <NA>
## UniqueID UniqueID 0.01182492 TRUE 0.01182492 <NA>
## Popular.fctr Popular.fctr NA TRUE NA <NA>
## is.ConditionalX.y is.cor.y.abs.low rsp_var_raw id_var rsp_var
## Popular NA FALSE TRUE NA NA
## UniqueID NA FALSE FALSE TRUE NA
## Popular.fctr NA NA NA NA TRUE
print("glb_feats_df vs. glb_entity_df: ");
## [1] "glb_feats_df vs. glb_entity_df: "
print(setdiff(glb_feats_df$id, names(glb_entity_df)))
## character(0)
print("glb_entity_df vs. glb_feats_df: ");
## [1] "glb_entity_df vs. glb_feats_df: "
# Ensure these are only chr vars
print(setdiff(setdiff(names(glb_entity_df), glb_feats_df$id),
myfind_chr_cols_df(glb_entity_df)))
## character(0)
#print(setdiff(setdiff(names(glb_entity_df), glb_exclude_vars_as_features),
# glb_feats_df$id))
print("glb_entity_df: "); print(dim(glb_entity_df))
## [1] "glb_entity_df: "
## [1] 8402 108
print("glb_trnent_df: "); print(dim(glb_trnent_df))
## [1] "glb_trnent_df: "
## [1] 6532 107
print("glb_fitent_df: "); print(dim(glb_fitent_df))
## [1] "glb_fitent_df: "
## [1] 4475 107
print("glb_OOBent_df: "); print(dim(glb_OOBent_df))
## [1] "glb_OOBent_df: "
## [1] 2057 107
print("glb_newent_df: "); print(dim(glb_newent_df))
## [1] "glb_newent_df: "
## [1] 1870 107
# sav_entity_df <- glb_entity_df
# glb_entity_df <- sav_entity_df
# # Does not handle NULL or length(glb_id_vars) > 1
# glb_entity_df$.src.trn <- 0
# glb_entity_df[glb_entity_df[, glb_id_vars] %in% glb_trnent_df[, glb_id_vars],
# ".src.trn"] <- 1
# glb_entity_df$.src.fit <- 0
# glb_entity_df[glb_entity_df[, glb_id_vars] %in% glb_fitent_df[, glb_id_vars],
# ".src.fit"] <- 1
# glb_entity_df$.src.OOB <- 0
# glb_entity_df[glb_entity_df[, glb_id_vars] %in% glb_OOBent_df[, glb_id_vars],
# ".src.OOB"] <- 1
# glb_entity_df$.src.new <- 0
# glb_entity_df[glb_entity_df[, glb_id_vars] %in% glb_newent_df[, glb_id_vars],
# ".src.new"] <- 1
# #print(unique(glb_entity_df[, ".src.trn"]))
# write_cols <- c(glb_feats_df$id,
# ".src.trn", ".src.fit", ".src.OOB", ".src.new")
# glb_entity_df <- glb_entity_df[, write_cols]
#
# tmp_feats_df <- glb_feats_df
# tmp_entity_df <- glb_entity_df
# Checkpoint the feature metadata and consolidated entity data to
# <glb_out_pfx>blddfs_dsk.RData so later steps can restart from here
save(glb_feats_df,
glb_entity_df, #glb_trnent_df, glb_fitent_df, glb_OOBent_df, glb_newent_df,
file=paste0(glb_out_pfx, "blddfs_dsk.RData"))
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))
# if (!all.equal(tmp_feats_df, glb_feats_df))
# stop("glb_feats_df r/w not working")
# if (!all.equal(tmp_entity_df, glb_entity_df))
# stop("glb_entity_df r/w not working")
# Log the start of the "fit.models" step in the chunk-timing table
# (myadd_chunk is a project helper; records bgn/end/elapsed per step)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 8 partition.data.training 5 0 165.032 166.466 1.434
## 9 fit.models 6 0 166.467 NA NA
# 6.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
# keep_cols <- setdiff(names(glb_entity_df),
# grep("^.src", names(glb_entity_df), value=TRUE))
# glb_trnent_df <- glb_entity_df[glb_entity_df$.src.trn == 1, keep_cols]
# glb_fitent_df <- glb_entity_df[glb_entity_df$.src.fit == 1, keep_cols]
# glb_OOBent_df <- glb_entity_df[glb_entity_df$.src.OOB == 1, keep_cols]
# glb_newent_df <- glb_entity_df[glb_entity_df$.src.new == 1, keep_cols]
#
# glb_models_lst <- list(); glb_models_df <- data.frame()
#
# Guard: binomial classification needs both response levels present in the
# fit partition; abort with the observed values otherwise
if (glb_is_classification && glb_is_binomial &&
(length(unique(glb_fitent_df[, glb_rsp_var])) < 2))
stop("glb_fitent_df$", glb_rsp_var, ": contains less than 2 unique values: ",
paste0(unique(glb_fitent_df[, glb_rsp_var]), collapse=", "))
# Pick the usable feature with the highest |cor(y, x)| (excluded and
# low-correlation features filtered out); orderBy is from package doBy --
# TODO confirm it is attached by the sourced utility scripts
max_cor_y_x_var <- orderBy(~ -cor.y.abs,
subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low))[1, "id"]
# Sanity check on the user-supplied baseline variable: if some other feature
# beats it on |cor(y, x)|, abort so the baseline choice can be revisited.
if (!is.null(glb_Baseline_mdl_var)) {
    # `&&` short-circuits on these scalar operands; the original elementwise
    # `&` always evaluated both sides and would error the `if` on any
    # length > 1 result.  Error text corrected: the condition fires when
    # max_cor_y_x_var has the HIGHER correlation, not the lower one.
    if ((max_cor_y_x_var != glb_Baseline_mdl_var) &&
        (glb_feats_df[max_cor_y_x_var, "cor.y.abs"] >
         glb_feats_df[glb_Baseline_mdl_var, "cor.y.abs"]))
        stop(max_cor_y_x_var, " has a higher correlation with ", glb_rsp_var,
             " than the Baseline var: ", glb_Baseline_mdl_var)
}
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")
# Baseline
# Fit the baseline classifier on the user-designated variable, if one was
# configured (myfit_mdl_fn is a project helper; fits on fit_df, scores OOB_df)
if (!is.null(glb_Baseline_mdl_var))
ret_lst <- myfit_mdl_fn(model_id="Baseline", model_method="mybaseln_classfr",
indep_vars_vctr=glb_Baseline_mdl_var,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitent_df, OOB_df=glb_OOBent_df)
# Most Frequent Outcome "MFO" model: mean(y) for regression
# Not using caret's nullModel since model stats not avl
# Cannot use rpart for multinomial classification since it predicts non-MFO
# Fit the trivial benchmark (always predicts the majority class, "N" per the
# output below); .rnorm is a noise column so the fit has an independent var
ret_lst <- myfit_mdl(model_id="MFO",
model_method=ifelse(glb_is_regression, "lm", "myMFO_classfr"),
model_type=glb_model_type,
indep_vars_vctr=".rnorm",
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitent_df, OOB_df=glb_OOBent_df)
## [1] "fitting model: MFO.myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.8326257 0.1673743
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.8326257 0.1673743
## 2 0.8326257 0.1673743
## 3 0.8326257 0.1673743
## 4 0.8326257 0.1673743
## 5 0.8326257 0.1673743
## 6 0.8326257 0.1673743
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.MFO.myMFO_classfr.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.8326257 0.1673743
## 2 0.8326257 0.1673743
## 3 0.8326257 0.1673743
## 4 0.8326257 0.1673743
## 5 0.8326257 0.1673743
## 6 0.8326257 0.1673743
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.MFO.myMFO_classfr.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 MFO.myMFO_classfr myMFO_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.694 0.003 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326257
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
# "random" model - only for classification;
# none needed for regression since it is same as MFO
# Braced body: in the original the unbraced `if` consequent sat two comment
# lines below the condition, so inserting any statement there would silently
# change what the `if` controls.
if (glb_is_classification) {
    ret_lst <- myfit_mdl(model_id="Random", model_method="myrandom_classfr",
                         model_type=glb_model_type,
                         indep_vars_vctr=".rnorm",
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitent_df, OOB_df=glb_OOBent_df)
}
## [1] "fitting model: Random.myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## [1] "in Random.Classifier$prob"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.2867534
## 3 0.2 0.1683938
## 4 0.3 0.1683938
## 5 0.4 0.1683938
## 6 0.5 0.1683938
## 7 0.6 0.1683938
## 8 0.7 0.1683938
## 9 0.8 0.1683938
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.1000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Random.myrandom_classfr.Y
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 0 3726
## Y 0 749
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1673743 0.0000000 0.1565447 0.1786398 0.8326257
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] " calling mypredict_mdl for OOB:"
## [1] "in Random.Classifier$prob"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.2865473
## 3 0.2 0.1404011
## 4 0.3 0.1404011
## 5 0.4 0.1404011
## 6 0.5 0.1404011
## 7 0.6 0.1404011
## 8 0.7 0.1404011
## 9 0.8 0.1404011
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.1000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Random.myrandom_classfr.Y
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 0 1713
## Y 0 344
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1672338 0.0000000 0.1513467 0.1840753 0.8327662
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## model_id model_method feats max.nTuningRuns
## 1 Random.myrandom_classfr myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.322 0.002 0.4975446
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.2867534 0.1673743
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.1565447 0.1786398 0 0.4821958
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.2865473 0.1672338
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.1513467 0.1840753 0
# Any models that have tuning parameters get "better" results with cross-validation
# (except rf) & "different" results for different outcome metrics
# Max.cor.Y
# Check impact of cv
# rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
# Single-predictor rpart on the best-correlated feature (max_cor_y_x_var,
# "WordCount.log" per the output below), no cross-validation
ret_lst <- myfit_mdl(model_id="Max.cor.Y.cv.0",
model_method="rpart",
model_type=glb_model_type,
indep_vars_vctr=max_cor_y_x_var,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitent_df, OOB_df=glb_OOBent_df)
## [1] "fitting model: Max.cor.Y.cv.0.rpart"
## [1] " indep_vars: WordCount.log"
## Loading required package: rpart
## Fitting cp = 0.00223 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0.002225189 0 1
##
## Node number 1: 4475 observations
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.rpart.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.rpart.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.rpart rpart WordCount.log 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.703 0.066 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8326257
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
# Same single-predictor rpart but with cp forced to 0 (fully grown tree,
# no pruning) and no cross-validation, to contrast with the pruned fit above
ret_lst <- myfit_mdl(model_id="Max.cor.Y.cv.0.cp.0",
model_method="rpart",
model_type=glb_model_type,
indep_vars_vctr=max_cor_y_x_var,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
n_cv_folds=0,
tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
## [1] "fitting model: Max.cor.Y.cv.0.cp.0.rpart"
## [1] " indep_vars: WordCount.log"
## Fitting cp = 0 on full training set
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0.0022251891 0 1.0000000
## 2 0.0020026702 13 0.9666222
## 3 0.0013351135 19 0.9519359
## 4 0.0008900757 39 0.9158879
## 5 0.0005340454 50 0.9052069
## 6 0.0002225189 55 0.9025367
## 7 0.0000000000 61 0.9012016
##
## Variable importance
## WordCount.log
## 100
##
## Node number 1: 4475 observations, complexity param=0.002225189
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
## left son=2 (3276 obs) right son=3 (1199 obs)
## Primary splits:
## WordCount.log < 6.528688 to the left, improve=109.5997, (0 missing)
##
## Node number 2: 3276 observations
## predicted class=N expected loss=0.1004274 P(node) =0.732067
## class counts: 2947 329
## probabilities: 0.900 0.100
##
## Node number 3: 1199 observations, complexity param=0.002225189
## predicted class=N expected loss=0.3502919 P(node) =0.267933
## class counts: 779 420
## probabilities: 0.650 0.350
## left son=6 (193 obs) right son=7 (1006 obs)
## Primary splits:
## WordCount.log < 6.663771 to the left, improve=3.008125, (0 missing)
##
## Node number 6: 193 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.2694301 P(node) =0.04312849
## class counts: 141 52
## probabilities: 0.731 0.269
## left son=12 (62 obs) right son=13 (131 obs)
## Primary splits:
## WordCount.log < 6.631343 to the right, improve=2.136379, (0 missing)
##
## Node number 7: 1006 observations, complexity param=0.002225189
## predicted class=N expected loss=0.3658052 P(node) =0.2248045
## class counts: 638 368
## probabilities: 0.634 0.366
## left son=14 (85 obs) right son=15 (921 obs)
## Primary splits:
## WordCount.log < 7.57327 to the right, improve=3.162874, (0 missing)
##
## Node number 12: 62 observations
## predicted class=N expected loss=0.1612903 P(node) =0.01385475
## class counts: 52 10
## probabilities: 0.839 0.161
##
## Node number 13: 131 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.3206107 P(node) =0.02927374
## class counts: 89 42
## probabilities: 0.679 0.321
## left son=26 (121 obs) right son=27 (10 obs)
## Primary splits:
## WordCount.log < 6.535966 to the right, improve=0.6968015, (0 missing)
##
## Node number 14: 85 observations, complexity param=0.002225189
## predicted class=N expected loss=0.2352941 P(node) =0.01899441
## class counts: 65 20
## probabilities: 0.765 0.235
## left son=28 (77 obs) right son=29 (8 obs)
## Primary splits:
## WordCount.log < 8.229096 to the left, improve=4.679144, (0 missing)
##
## Node number 15: 921 observations, complexity param=0.002225189
## predicted class=N expected loss=0.3778502 P(node) =0.2058101
## class counts: 573 348
## probabilities: 0.622 0.378
## left son=30 (734 obs) right son=31 (187 obs)
## Primary splits:
## WordCount.log < 6.775937 to the right, improve=1.435366, (0 missing)
##
## Node number 26: 121 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.3057851 P(node) =0.02703911
## class counts: 84 37
## probabilities: 0.694 0.306
## left son=52 (15 obs) right son=53 (106 obs)
## Primary splits:
## WordCount.log < 6.548935 to the left, improve=1.018442, (0 missing)
##
## Node number 27: 10 observations
## predicted class=N expected loss=0.5 P(node) =0.002234637
## class counts: 5 5
## probabilities: 0.500 0.500
##
## Node number 28: 77 observations
## predicted class=N expected loss=0.1818182 P(node) =0.0172067
## class counts: 63 14
## probabilities: 0.818 0.182
##
## Node number 29: 8 observations
## predicted class=Y expected loss=0.25 P(node) =0.001787709
## class counts: 2 6
## probabilities: 0.250 0.750
##
## Node number 30: 734 observations, complexity param=0.002225189
## predicted class=N expected loss=0.3637602 P(node) =0.1640223
## class counts: 467 267
## probabilities: 0.636 0.364
## left son=60 (11 obs) right son=61 (723 obs)
## Primary splits:
## WordCount.log < 6.782759 to the left, improve=2.955363, (0 missing)
##
## Node number 31: 187 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4331551 P(node) =0.04178771
## class counts: 106 81
## probabilities: 0.567 0.433
## left son=62 (177 obs) right son=63 (10 obs)
## Primary splits:
## WordCount.log < 6.771362 to the left, improve=2.843566, (0 missing)
##
## Node number 52: 15 observations
## predicted class=N expected loss=0.1333333 P(node) =0.003351955
## class counts: 13 2
## probabilities: 0.867 0.133
##
## Node number 53: 106 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.3301887 P(node) =0.02368715
## class counts: 71 35
## probabilities: 0.670 0.330
## left son=106 (87 obs) right son=107 (19 obs)
## Primary splits:
## WordCount.log < 6.566671 to the right, improve=1.780924, (0 missing)
##
## Node number 60: 11 observations
## predicted class=N expected loss=0 P(node) =0.002458101
## class counts: 11 0
## probabilities: 1.000 0.000
##
## Node number 61: 723 observations, complexity param=0.002225189
## predicted class=N expected loss=0.3692946 P(node) =0.1615642
## class counts: 456 267
## probabilities: 0.631 0.369
## left son=122 (515 obs) right son=123 (208 obs)
## Primary splits:
## WordCount.log < 7.162785 to the left, improve=1.689287, (0 missing)
##
## Node number 62: 177 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4124294 P(node) =0.03955307
## class counts: 104 73
## probabilities: 0.588 0.412
## left son=124 (125 obs) right son=125 (52 obs)
## Primary splits:
## WordCount.log < 6.736373 to the left, improve=0.6877723, (0 missing)
##
## Node number 63: 10 observations
## predicted class=Y expected loss=0.2 P(node) =0.002234637
## class counts: 2 8
## probabilities: 0.200 0.800
##
## Node number 106: 87 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.2873563 P(node) =0.01944134
## class counts: 62 25
## probabilities: 0.713 0.287
## left son=212 (41 obs) right son=213 (46 obs)
## Primary splits:
## WordCount.log < 6.597826 to the left, improve=1.319353, (0 missing)
##
## Node number 107: 19 observations
## predicted class=Y expected loss=0.4736842 P(node) =0.00424581
## class counts: 9 10
## probabilities: 0.474 0.526
##
## Node number 122: 515 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3475728 P(node) =0.1150838
## class counts: 336 179
## probabilities: 0.652 0.348
## left son=244 (190 obs) right son=245 (325 obs)
## Primary splits:
## WordCount.log < 6.982399 to the right, improve=2.835815, (0 missing)
##
## Node number 123: 208 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4230769 P(node) =0.04648045
## class counts: 120 88
## probabilities: 0.577 0.423
## left son=246 (199 obs) right son=247 (9 obs)
## Primary splits:
## WordCount.log < 7.17434 to the right, improve=2.367049, (0 missing)
##
## Node number 124: 125 observations, complexity param=0.00200267
## predicted class=N expected loss=0.384 P(node) =0.02793296
## class counts: 77 48
## probabilities: 0.616 0.384
## left son=248 (40 obs) right son=249 (85 obs)
## Primary splits:
## WordCount.log < 6.713563 to the right, improve=1.397765, (0 missing)
##
## Node number 125: 52 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4807692 P(node) =0.01162011
## class counts: 27 25
## probabilities: 0.519 0.481
## left son=250 (40 obs) right son=251 (12 obs)
## Primary splits:
## WordCount.log < 6.745823 to the right, improve=2.261538, (0 missing)
##
## Node number 212: 41 observations
## predicted class=N expected loss=0.195122 P(node) =0.009162011
## class counts: 33 8
## probabilities: 0.805 0.195
##
## Node number 213: 46 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.3695652 P(node) =0.01027933
## class counts: 29 17
## probabilities: 0.630 0.370
## left son=426 (33 obs) right son=427 (13 obs)
## Primary splits:
## WordCount.log < 6.605974 to the right, improve=2.190027, (0 missing)
##
## Node number 244: 190 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.2789474 P(node) =0.0424581
## class counts: 137 53
## probabilities: 0.721 0.279
## left son=488 (8 obs) right son=489 (182 obs)
## Primary splits:
## WordCount.log < 7.158125 to the right, improve=1.299711, (0 missing)
##
## Node number 245: 325 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3876923 P(node) =0.0726257
## class counts: 199 126
## probabilities: 0.612 0.388
## left son=490 (231 obs) right son=491 (94 obs)
## Primary splits:
## WordCount.log < 6.912741 to the left, improve=0.9243624, (0 missing)
##
## Node number 246: 199 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4070352 P(node) =0.04446927
## class counts: 118 81
## probabilities: 0.593 0.407
## left son=492 (114 obs) right son=493 (85 obs)
## Primary splits:
## WordCount.log < 7.275172 to the right, improve=0.7959052, (0 missing)
##
## Node number 247: 9 observations
## predicted class=Y expected loss=0.2222222 P(node) =0.002011173
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 248: 40 observations
## predicted class=N expected loss=0.275 P(node) =0.008938547
## class counts: 29 11
## probabilities: 0.725 0.275
##
## Node number 249: 85 observations, complexity param=0.00200267
## predicted class=N expected loss=0.4352941 P(node) =0.01899441
## class counts: 48 37
## probabilities: 0.565 0.435
## left son=498 (56 obs) right son=499 (29 obs)
## Primary splits:
## WordCount.log < 6.698884 to the left, improve=1.193408, (0 missing)
##
## Node number 250: 40 observations
## predicted class=N expected loss=0.4 P(node) =0.008938547
## class counts: 24 16
## probabilities: 0.600 0.400
##
## Node number 251: 12 observations
## predicted class=Y expected loss=0.25 P(node) =0.002681564
## class counts: 3 9
## probabilities: 0.250 0.750
##
## Node number 426: 33 observations
## predicted class=N expected loss=0.2727273 P(node) =0.007374302
## class counts: 24 9
## probabilities: 0.727 0.273
##
## Node number 427: 13 observations
## predicted class=Y expected loss=0.3846154 P(node) =0.002905028
## class counts: 5 8
## probabilities: 0.385 0.615
##
## Node number 488: 8 observations
## predicted class=N expected loss=0 P(node) =0.001787709
## class counts: 8 0
## probabilities: 1.000 0.000
##
## Node number 489: 182 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.2912088 P(node) =0.04067039
## class counts: 129 53
## probabilities: 0.709 0.291
## left son=978 (127 obs) right son=979 (55 obs)
## Primary splits:
## WordCount.log < 7.088408 to the left, improve=1.865726, (0 missing)
##
## Node number 490: 231 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3636364 P(node) =0.05162011
## class counts: 147 84
## probabilities: 0.636 0.364
## left son=980 (36 obs) right son=981 (195 obs)
## Primary splits:
## WordCount.log < 6.892134 to the right, improve=1.705672, (0 missing)
##
## Node number 491: 94 observations, complexity param=0.001335113
## predicted class=N expected loss=0.4468085 P(node) =0.02100559
## class counts: 52 42
## probabilities: 0.553 0.447
## left son=982 (63 obs) right son=983 (31 obs)
## Primary splits:
## WordCount.log < 6.935857 to the right, improve=0.9545162, (0 missing)
##
## Node number 492: 114 observations, complexity param=0.0005340454
## predicted class=N expected loss=0.3684211 P(node) =0.02547486
## class counts: 72 42
## probabilities: 0.632 0.368
## left son=984 (35 obs) right son=985 (79 obs)
## Primary splits:
## WordCount.log < 7.345687 to the left, improve=1.250823, (0 missing)
##
## Node number 493: 85 observations, complexity param=0.002225189
## predicted class=N expected loss=0.4588235 P(node) =0.01899441
## class counts: 46 39
## probabilities: 0.541 0.459
## left son=986 (69 obs) right son=987 (16 obs)
## Primary splits:
## WordCount.log < 7.257355 to the left, improve=1.088576, (0 missing)
##
## Node number 498: 56 observations, complexity param=0.00200267
## predicted class=N expected loss=0.375 P(node) =0.01251397
## class counts: 35 21
## probabilities: 0.625 0.375
## left son=996 (12 obs) right son=997 (44 obs)
## Primary splits:
## WordCount.log < 6.691463 to the right, improve=1.325758, (0 missing)
##
## Node number 499: 29 observations
## predicted class=Y expected loss=0.4482759 P(node) =0.006480447
## class counts: 13 16
## probabilities: 0.448 0.552
##
## Node number 978: 127 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2440945 P(node) =0.02837989
## class counts: 96 31
## probabilities: 0.756 0.244
## left son=1956 (9 obs) right son=1957 (118 obs)
## Primary splits:
## WordCount.log < 7.080026 to the right, improve=1.154277, (0 missing)
##
## Node number 979: 55 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.4 P(node) =0.0122905
## class counts: 33 22
## probabilities: 0.600 0.400
## left son=1958 (43 obs) right son=1959 (12 obs)
## Primary splits:
## WordCount.log < 7.100027 to the right, improve=1.031783, (0 missing)
##
## Node number 980: 36 observations
## predicted class=N expected loss=0.2222222 P(node) =0.008044693
## class counts: 28 8
## probabilities: 0.778 0.222
##
## Node number 981: 195 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3897436 P(node) =0.04357542
## class counts: 119 76
## probabilities: 0.610 0.390
## left son=1962 (185 obs) right son=1963 (10 obs)
## Primary splits:
## WordCount.log < 6.884486 to the left, improve=0.9319473, (0 missing)
##
## Node number 982: 63 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3968254 P(node) =0.01407821
## class counts: 38 25
## probabilities: 0.603 0.397
## left son=1964 (10 obs) right son=1965 (53 obs)
## Primary splits:
## WordCount.log < 6.942156 to the left, improve=2.094579, (0 missing)
##
## Node number 983: 31 observations, complexity param=0.001335113
## predicted class=Y expected loss=0.4516129 P(node) =0.006927374
## class counts: 14 17
## probabilities: 0.452 0.548
## left son=1966 (24 obs) right son=1967 (7 obs)
## Primary splits:
## WordCount.log < 6.928048 to the left, improve=0.4976959, (0 missing)
##
## Node number 984: 35 observations
## predicted class=N expected loss=0.2571429 P(node) =0.007821229
## class counts: 26 9
## probabilities: 0.743 0.257
##
## Node number 985: 79 observations, complexity param=0.0005340454
## predicted class=N expected loss=0.4177215 P(node) =0.01765363
## class counts: 46 33
## probabilities: 0.582 0.418
## left son=1970 (67 obs) right son=1971 (12 obs)
## Primary splits:
## WordCount.log < 7.543801 to the left, improve=0.1915738, (0 missing)
##
## Node number 986: 69 observations, complexity param=0.00200267
## predicted class=N expected loss=0.4202899 P(node) =0.01541899
## class counts: 40 29
## probabilities: 0.580 0.420
## left son=1972 (14 obs) right son=1973 (55 obs)
## Primary splits:
## WordCount.log < 7.191805 to the left, improve=0.6361754, (0 missing)
##
## Node number 987: 16 observations
## predicted class=Y expected loss=0.375 P(node) =0.003575419
## class counts: 6 10
## probabilities: 0.375 0.625
##
## Node number 996: 12 observations
## predicted class=N expected loss=0.1666667 P(node) =0.002681564
## class counts: 10 2
## probabilities: 0.833 0.167
##
## Node number 997: 44 observations, complexity param=0.00200267
## predicted class=N expected loss=0.4318182 P(node) =0.009832402
## class counts: 25 19
## probabilities: 0.568 0.432
## left son=1994 (35 obs) right son=1995 (9 obs)
## Primary splits:
## WordCount.log < 6.685236 to the left, improve=2.708369, (0 missing)
##
## Node number 1956: 9 observations
## predicted class=N expected loss=0 P(node) =0.002011173
## class counts: 9 0
## probabilities: 1.000 0.000
##
## Node number 1957: 118 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2627119 P(node) =0.02636872
## class counts: 87 31
## probabilities: 0.737 0.263
## left son=3914 (98 obs) right son=3915 (20 obs)
## Primary splits:
## WordCount.log < 7.060476 to the left, improve=0.9077828, (0 missing)
##
## Node number 1958: 43 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.3488372 P(node) =0.009608939
## class counts: 28 15
## probabilities: 0.651 0.349
## left son=3916 (11 obs) right son=3917 (32 obs)
## Primary splits:
## WordCount.log < 7.11192 to the left, improve=0.8246564, (0 missing)
##
## Node number 1959: 12 observations
## predicted class=Y expected loss=0.4166667 P(node) =0.002681564
## class counts: 5 7
## probabilities: 0.417 0.583
##
## Node number 1962: 185 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3783784 P(node) =0.04134078
## class counts: 115 70
## probabilities: 0.622 0.378
## left son=3924 (164 obs) right son=3925 (21 obs)
## Primary splits:
## WordCount.log < 6.790659 to the right, improve=1.002056, (0 missing)
##
## Node number 1963: 10 observations
## predicted class=Y expected loss=0.4 P(node) =0.002234637
## class counts: 4 6
## probabilities: 0.400 0.600
##
## Node number 1964: 10 observations
## predicted class=N expected loss=0.1 P(node) =0.002234637
## class counts: 9 1
## probabilities: 0.900 0.100
##
## Node number 1965: 53 observations, complexity param=0.001335113
## predicted class=N expected loss=0.4528302 P(node) =0.01184358
## class counts: 29 24
## probabilities: 0.547 0.453
## left son=3930 (21 obs) right son=3931 (32 obs)
## Primary splits:
## WordCount.log < 6.956069 to the left, improve=0.359389, (0 missing)
##
## Node number 1966: 24 observations, complexity param=0.001335113
## predicted class=N expected loss=0.5 P(node) =0.005363128
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=3932 (9 obs) right son=3933 (15 obs)
## Primary splits:
## WordCount.log < 6.920178 to the right, improve=0.8, (0 missing)
##
## Node number 1967: 7 observations
## predicted class=Y expected loss=0.2857143 P(node) =0.001564246
## class counts: 2 5
## probabilities: 0.286 0.714
##
## Node number 1970: 67 observations, complexity param=0.0005340454
## predicted class=N expected loss=0.4029851 P(node) =0.01497207
## class counts: 40 27
## probabilities: 0.597 0.403
## left son=3940 (8 obs) right son=3941 (59 obs)
## Primary splits:
## WordCount.log < 7.506042 to the right, improve=0.4252466, (0 missing)
##
## Node number 1971: 12 observations
## predicted class=N expected loss=0.5 P(node) =0.002681564
## class counts: 6 6
## probabilities: 0.500 0.500
##
## Node number 1972: 14 observations
## predicted class=N expected loss=0.2857143 P(node) =0.003128492
## class counts: 10 4
## probabilities: 0.714 0.286
##
## Node number 1973: 55 observations, complexity param=0.00200267
## predicted class=N expected loss=0.4545455 P(node) =0.0122905
## class counts: 30 25
## probabilities: 0.545 0.455
## left son=3946 (40 obs) right son=3947 (15 obs)
## Primary splits:
## WordCount.log < 7.20897 to the right, improve=0.8727273, (0 missing)
##
## Node number 1994: 35 observations
## predicted class=N expected loss=0.3428571 P(node) =0.007821229
## class counts: 23 12
## probabilities: 0.657 0.343
##
## Node number 1995: 9 observations
## predicted class=Y expected loss=0.2222222 P(node) =0.002011173
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 3914: 98 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2346939 P(node) =0.02189944
## class counts: 75 23
## probabilities: 0.765 0.235
## left son=7828 (27 obs) right son=7829 (71 obs)
## Primary splits:
## WordCount.log < 7.044469 to the right, improve=0.5582809, (0 missing)
##
## Node number 3915: 20 observations
## predicted class=N expected loss=0.4 P(node) =0.004469274
## class counts: 12 8
## probabilities: 0.600 0.400
##
## Node number 3916: 11 observations
## predicted class=N expected loss=0.1818182 P(node) =0.002458101
## class counts: 9 2
## probabilities: 0.818 0.182
##
## Node number 3917: 32 observations, complexity param=0.0008900757
## predicted class=N expected loss=0.40625 P(node) =0.007150838
## class counts: 19 13
## probabilities: 0.594 0.406
## left son=7834 (22 obs) right son=7835 (10 obs)
## Primary splits:
## WordCount.log < 7.127693 to the right, improve=1.092045, (0 missing)
##
## Node number 3924: 164 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3597561 P(node) =0.03664804
## class counts: 105 59
## probabilities: 0.640 0.360
## left son=7848 (18 obs) right son=7849 (146 obs)
## Primary splits:
## WordCount.log < 6.873163 to the right, improve=0.7649144, (0 missing)
##
## Node number 3925: 21 observations
## predicted class=Y expected loss=0.4761905 P(node) =0.004692737
## class counts: 10 11
## probabilities: 0.476 0.524
##
## Node number 3930: 21 observations
## predicted class=N expected loss=0.3809524 P(node) =0.004692737
## class counts: 13 8
## probabilities: 0.619 0.381
##
## Node number 3931: 32 observations, complexity param=0.001335113
## predicted class=N expected loss=0.5 P(node) =0.007150838
## class counts: 16 16
## probabilities: 0.500 0.500
## left son=7862 (23 obs) right son=7863 (9 obs)
## Primary splits:
## WordCount.log < 6.96319 to the right, improve=1.932367, (0 missing)
##
## Node number 3932: 9 observations
## predicted class=N expected loss=0.3333333 P(node) =0.002011173
## class counts: 6 3
## probabilities: 0.667 0.333
##
## Node number 3933: 15 observations
## predicted class=Y expected loss=0.4 P(node) =0.003351955
## class counts: 6 9
## probabilities: 0.400 0.600
##
## Node number 3940: 8 observations
## predicted class=N expected loss=0.25 P(node) =0.001787709
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 3941: 59 observations, complexity param=0.0005340454
## predicted class=N expected loss=0.4237288 P(node) =0.01318436
## class counts: 34 25
## probabilities: 0.576 0.424
## left son=7882 (27 obs) right son=7883 (32 obs)
## Primary splits:
## WordCount.log < 7.405491 to the left, improve=0.2834667, (0 missing)
##
## Node number 3946: 40 observations, complexity param=0.001335113
## predicted class=N expected loss=0.4 P(node) =0.008938547
## class counts: 24 16
## probabilities: 0.600 0.400
## left son=7892 (7 obs) right son=7893 (33 obs)
## Primary splits:
## WordCount.log < 7.220371 to the left, improve=1.122078, (0 missing)
##
## Node number 3947: 15 observations
## predicted class=Y expected loss=0.4 P(node) =0.003351955
## class counts: 6 9
## probabilities: 0.400 0.600
##
## Node number 7828: 27 observations
## predicted class=N expected loss=0.1481481 P(node) =0.00603352
## class counts: 23 4
## probabilities: 0.852 0.148
##
## Node number 7829: 71 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2676056 P(node) =0.01586592
## class counts: 52 19
## probabilities: 0.732 0.268
## left son=15658 (52 obs) right son=15659 (19 obs)
## Primary splits:
## WordCount.log < 7.025094 to the left, improve=0.5273422, (0 missing)
##
## Node number 7834: 22 observations
## predicted class=N expected loss=0.3181818 P(node) =0.004916201
## class counts: 15 7
## probabilities: 0.682 0.318
##
## Node number 7835: 10 observations
## predicted class=Y expected loss=0.4 P(node) =0.002234637
## class counts: 4 6
## probabilities: 0.400 0.600
##
## Node number 7848: 18 observations
## predicted class=N expected loss=0.2222222 P(node) =0.004022346
## class counts: 14 4
## probabilities: 0.778 0.222
##
## Node number 7849: 146 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3767123 P(node) =0.0326257
## class counts: 91 55
## probabilities: 0.623 0.377
## left son=15698 (130 obs) right son=15699 (16 obs)
## Primary splits:
## WordCount.log < 6.862757 to the left, improve=2.21549, (0 missing)
##
## Node number 7862: 23 observations
## predicted class=N expected loss=0.3913043 P(node) =0.005139665
## class counts: 14 9
## probabilities: 0.609 0.391
##
## Node number 7863: 9 observations
## predicted class=Y expected loss=0.2222222 P(node) =0.002011173
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 7882: 27 observations
## predicted class=N expected loss=0.3703704 P(node) =0.00603352
## class counts: 17 10
## probabilities: 0.630 0.370
##
## Node number 7883: 32 observations, complexity param=0.0005340454
## predicted class=N expected loss=0.46875 P(node) =0.007150838
## class counts: 17 15
## probabilities: 0.531 0.469
## left son=15766 (16 obs) right son=15767 (16 obs)
## Primary splits:
## WordCount.log < 7.444526 to the right, improve=0.5625, (0 missing)
##
## Node number 7892: 7 observations
## predicted class=N expected loss=0.1428571 P(node) =0.001564246
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 7893: 33 observations, complexity param=0.001335113
## predicted class=N expected loss=0.4545455 P(node) =0.007374302
## class counts: 18 15
## probabilities: 0.545 0.455
## left son=15786 (15 obs) right son=15787 (18 obs)
## Primary splits:
## WordCount.log < 7.238497 to the right, improve=0.8080808, (0 missing)
##
## Node number 15658: 52 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2307692 P(node) =0.01162011
## class counts: 40 12
## probabilities: 0.769 0.231
## left son=31316 (11 obs) right son=31317 (41 obs)
## Primary splits:
## WordCount.log < 7.013015 to the right, improve=1.485929, (0 missing)
##
## Node number 15659: 19 observations
## predicted class=N expected loss=0.3684211 P(node) =0.00424581
## class counts: 12 7
## probabilities: 0.632 0.368
##
## Node number 15698: 130 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3461538 P(node) =0.02905028
## class counts: 85 45
## probabilities: 0.654 0.346
## left son=31396 (32 obs) right son=31397 (98 obs)
## Primary splits:
## WordCount.log < 6.843217 to the right, improve=0.7849294, (0 missing)
##
## Node number 15699: 16 observations
## predicted class=Y expected loss=0.375 P(node) =0.003575419
## class counts: 6 10
## probabilities: 0.375 0.625
##
## Node number 15766: 16 observations
## predicted class=N expected loss=0.375 P(node) =0.003575419
## class counts: 10 6
## probabilities: 0.625 0.375
##
## Node number 15767: 16 observations
## predicted class=Y expected loss=0.4375 P(node) =0.003575419
## class counts: 7 9
## probabilities: 0.437 0.562
##
## Node number 15786: 15 observations
## predicted class=N expected loss=0.3333333 P(node) =0.003351955
## class counts: 10 5
## probabilities: 0.667 0.333
##
## Node number 15787: 18 observations
## predicted class=Y expected loss=0.4444444 P(node) =0.004022346
## class counts: 8 10
## probabilities: 0.444 0.556
##
## Node number 31316: 11 observations
## predicted class=N expected loss=0 P(node) =0.002458101
## class counts: 11 0
## probabilities: 1.000 0.000
##
## Node number 31317: 41 observations, complexity param=0.0002225189
## predicted class=N expected loss=0.2926829 P(node) =0.009162011
## class counts: 29 12
## probabilities: 0.707 0.293
## left son=62634 (30 obs) right son=62635 (11 obs)
## Primary splits:
## WordCount.log < 7.004882 to the left, improve=1.921064, (0 missing)
##
## Node number 31396: 32 observations
## predicted class=N expected loss=0.25 P(node) =0.007150838
## class counts: 24 8
## probabilities: 0.750 0.250
##
## Node number 31397: 98 observations, complexity param=0.001335113
## predicted class=N expected loss=0.377551 P(node) =0.02189944
## class counts: 61 37
## probabilities: 0.622 0.378
## left son=62794 (83 obs) right son=62795 (15 obs)
## Primary splits:
## WordCount.log < 6.836796 to the left, improve=1.752791, (0 missing)
##
## Node number 62634: 30 observations
## predicted class=N expected loss=0.2 P(node) =0.006703911
## class counts: 24 6
## probabilities: 0.800 0.200
##
## Node number 62635: 11 observations
## predicted class=Y expected loss=0.4545455 P(node) =0.002458101
## class counts: 5 6
## probabilities: 0.455 0.545
##
## Node number 62794: 83 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3373494 P(node) =0.01854749
## class counts: 55 28
## probabilities: 0.663 0.337
## left son=125588 (11 obs) right son=125589 (72 obs)
## Primary splits:
## WordCount.log < 6.829253 to the right, improve=0.6134842, (0 missing)
##
## Node number 62795: 15 observations
## predicted class=Y expected loss=0.4 P(node) =0.003351955
## class counts: 6 9
## probabilities: 0.400 0.600
##
## Node number 125588: 11 observations
## predicted class=N expected loss=0.1818182 P(node) =0.002458101
## class counts: 9 2
## probabilities: 0.818 0.182
##
## Node number 125589: 72 observations, complexity param=0.001335113
## predicted class=N expected loss=0.3611111 P(node) =0.01608939
## class counts: 46 26
## probabilities: 0.639 0.361
## left son=251178 (29 obs) right son=251179 (43 obs)
## Primary splits:
## WordCount.log < 6.807382 to the left, improve=0.7057828, (0 missing)
##
## Node number 251178: 29 observations
## predicted class=N expected loss=0.2758621 P(node) =0.006480447
## class counts: 21 8
## probabilities: 0.724 0.276
##
## Node number 251179: 43 observations, complexity param=0.001335113
## predicted class=N expected loss=0.4186047 P(node) =0.009608939
## class counts: 25 18
## probabilities: 0.581 0.419
## left son=502358 (35 obs) right son=502359 (8 obs)
## Primary splits:
## WordCount.log < 6.810693 to the right, improve=2.158804, (0 missing)
##
## Node number 502358: 35 observations
## predicted class=N expected loss=0.3428571 P(node) =0.007821229
## class counts: 23 12
## probabilities: 0.657 0.343
##
## Node number 502359: 8 observations
## predicted class=Y expected loss=0.25 P(node) =0.001787709
## class counts: 2 6
## probabilities: 0.250 0.750
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743)
## 2) WordCount.log< 6.528688 3276 329 N (0.8995726 0.1004274) *
## 3) WordCount.log>=6.528688 1199 420 N (0.6497081 0.3502919)
## 6) WordCount.log< 6.663771 193 52 N (0.7305699 0.2694301)
## 12) WordCount.log>=6.631343 62 10 N (0.8387097 0.1612903) *
## 13) WordCount.log< 6.631343 131 42 N (0.6793893 0.3206107)
## 26) WordCount.log>=6.535966 121 37 N (0.6942149 0.3057851)
## 52) WordCount.log< 6.548935 15 2 N (0.8666667 0.1333333) *
## 53) WordCount.log>=6.548935 106 35 N (0.6698113 0.3301887)
## 106) WordCount.log>=6.566671 87 25 N (0.7126437 0.2873563)
## 212) WordCount.log< 6.597826 41 8 N (0.8048780 0.1951220) *
## 213) WordCount.log>=6.597826 46 17 N (0.6304348 0.3695652)
## 426) WordCount.log>=6.605974 33 9 N (0.7272727 0.2727273) *
## 427) WordCount.log< 6.605974 13 5 Y (0.3846154 0.6153846) *
## 107) WordCount.log< 6.566671 19 9 Y (0.4736842 0.5263158) *
## 27) WordCount.log< 6.535966 10 5 N (0.5000000 0.5000000) *
## 7) WordCount.log>=6.663771 1006 368 N (0.6341948 0.3658052)
## 14) WordCount.log>=7.57327 85 20 N (0.7647059 0.2352941)
## 28) WordCount.log< 8.229096 77 14 N (0.8181818 0.1818182) *
## 29) WordCount.log>=8.229096 8 2 Y (0.2500000 0.7500000) *
## 15) WordCount.log< 7.57327 921 348 N (0.6221498 0.3778502)
## 30) WordCount.log>=6.775937 734 267 N (0.6362398 0.3637602)
## 60) WordCount.log< 6.782759 11 0 N (1.0000000 0.0000000) *
## 61) WordCount.log>=6.782759 723 267 N (0.6307054 0.3692946)
## 122) WordCount.log< 7.162785 515 179 N (0.6524272 0.3475728)
## 244) WordCount.log>=6.982399 190 53 N (0.7210526 0.2789474)
## 488) WordCount.log>=7.158125 8 0 N (1.0000000 0.0000000) *
## 489) WordCount.log< 7.158125 182 53 N (0.7087912 0.2912088)
## 978) WordCount.log< 7.088408 127 31 N (0.7559055 0.2440945)
## 1956) WordCount.log>=7.080026 9 0 N (1.0000000 0.0000000) *
## 1957) WordCount.log< 7.080026 118 31 N (0.7372881 0.2627119)
## 3914) WordCount.log< 7.060476 98 23 N (0.7653061 0.2346939)
## 7828) WordCount.log>=7.044469 27 4 N (0.8518519 0.1481481) *
## 7829) WordCount.log< 7.044469 71 19 N (0.7323944 0.2676056)
## 15658) WordCount.log< 7.025094 52 12 N (0.7692308 0.2307692)
## 31316) WordCount.log>=7.013015 11 0 N (1.0000000 0.0000000) *
## 31317) WordCount.log< 7.013015 41 12 N (0.7073171 0.2926829)
## 62634) WordCount.log< 7.004882 30 6 N (0.8000000 0.2000000) *
## 62635) WordCount.log>=7.004882 11 5 Y (0.4545455 0.5454545) *
## 15659) WordCount.log>=7.025094 19 7 N (0.6315789 0.3684211) *
## 3915) WordCount.log>=7.060476 20 8 N (0.6000000 0.4000000) *
## 979) WordCount.log>=7.088408 55 22 N (0.6000000 0.4000000)
## 1958) WordCount.log>=7.100027 43 15 N (0.6511628 0.3488372)
## 3916) WordCount.log< 7.11192 11 2 N (0.8181818 0.1818182) *
## 3917) WordCount.log>=7.11192 32 13 N (0.5937500 0.4062500)
## 7834) WordCount.log>=7.127693 22 7 N (0.6818182 0.3181818) *
## 7835) WordCount.log< 7.127693 10 4 Y (0.4000000 0.6000000) *
## 1959) WordCount.log< 7.100027 12 5 Y (0.4166667 0.5833333) *
## 245) WordCount.log< 6.982399 325 126 N (0.6123077 0.3876923)
## 490) WordCount.log< 6.912741 231 84 N (0.6363636 0.3636364)
## 980) WordCount.log>=6.892134 36 8 N (0.7777778 0.2222222) *
## 981) WordCount.log< 6.892134 195 76 N (0.6102564 0.3897436)
## 1962) WordCount.log< 6.884486 185 70 N (0.6216216 0.3783784)
## 3924) WordCount.log>=6.790659 164 59 N (0.6402439 0.3597561)
## 7848) WordCount.log>=6.873163 18 4 N (0.7777778 0.2222222) *
## 7849) WordCount.log< 6.873163 146 55 N (0.6232877 0.3767123)
## 15698) WordCount.log< 6.862757 130 45 N (0.6538462 0.3461538)
## 31396) WordCount.log>=6.843217 32 8 N (0.7500000 0.2500000) *
## 31397) WordCount.log< 6.843217 98 37 N (0.6224490 0.3775510)
## 62794) WordCount.log< 6.836796 83 28 N (0.6626506 0.3373494)
## 125588) WordCount.log>=6.829253 11 2 N (0.8181818 0.1818182) *
## 125589) WordCount.log< 6.829253 72 26 N (0.6388889 0.3611111)
## 251178) WordCount.log< 6.807382 29 8 N (0.7241379 0.2758621) *
## 251179) WordCount.log>=6.807382 43 18 N (0.5813953 0.4186047)
## 502358) WordCount.log>=6.810693 35 12 N (0.6571429 0.3428571) *
## 502359) WordCount.log< 6.810693 8 2 Y (0.2500000 0.7500000) *
## 62795) WordCount.log>=6.836796 15 6 Y (0.4000000 0.6000000) *
## 15699) WordCount.log>=6.862757 16 6 Y (0.3750000 0.6250000) *
## 3925) WordCount.log< 6.790659 21 10 Y (0.4761905 0.5238095) *
## 1963) WordCount.log>=6.884486 10 4 Y (0.4000000 0.6000000) *
## 491) WordCount.log>=6.912741 94 42 N (0.5531915 0.4468085)
## 982) WordCount.log>=6.935857 63 25 N (0.6031746 0.3968254)
## 1964) WordCount.log< 6.942156 10 1 N (0.9000000 0.1000000) *
## 1965) WordCount.log>=6.942156 53 24 N (0.5471698 0.4528302)
## 3930) WordCount.log< 6.956069 21 8 N (0.6190476 0.3809524) *
## 3931) WordCount.log>=6.956069 32 16 N (0.5000000 0.5000000)
## 7862) WordCount.log>=6.96319 23 9 N (0.6086957 0.3913043) *
## 7863) WordCount.log< 6.96319 9 2 Y (0.2222222 0.7777778) *
## 983) WordCount.log< 6.935857 31 14 Y (0.4516129 0.5483871)
## 1966) WordCount.log< 6.928048 24 12 N (0.5000000 0.5000000)
## 3932) WordCount.log>=6.920178 9 3 N (0.6666667 0.3333333) *
## 3933) WordCount.log< 6.920178 15 6 Y (0.4000000 0.6000000) *
## 1967) WordCount.log>=6.928048 7 2 Y (0.2857143 0.7142857) *
## 123) WordCount.log>=7.162785 208 88 N (0.5769231 0.4230769)
## 246) WordCount.log>=7.17434 199 81 N (0.5929648 0.4070352)
## 492) WordCount.log>=7.275172 114 42 N (0.6315789 0.3684211)
## 984) WordCount.log< 7.345687 35 9 N (0.7428571 0.2571429) *
## 985) WordCount.log>=7.345687 79 33 N (0.5822785 0.4177215)
## 1970) WordCount.log< 7.543801 67 27 N (0.5970149 0.4029851)
## 3940) WordCount.log>=7.506042 8 2 N (0.7500000 0.2500000) *
## 3941) WordCount.log< 7.506042 59 25 N (0.5762712 0.4237288)
## 7882) WordCount.log< 7.405491 27 10 N (0.6296296 0.3703704) *
## 7883) WordCount.log>=7.405491 32 15 N (0.5312500 0.4687500)
## 15766) WordCount.log>=7.444526 16 6 N (0.6250000 0.3750000) *
## 15767) WordCount.log< 7.444526 16 7 Y (0.4375000 0.5625000) *
## 1971) WordCount.log>=7.543801 12 6 N (0.5000000 0.5000000) *
## 493) WordCount.log< 7.275172 85 39 N (0.5411765 0.4588235)
## 986) WordCount.log< 7.257355 69 29 N (0.5797101 0.4202899)
## 1972) WordCount.log< 7.191805 14 4 N (0.7142857 0.2857143) *
## 1973) WordCount.log>=7.191805 55 25 N (0.5454545 0.4545455)
## 3946) WordCount.log>=7.20897 40 16 N (0.6000000 0.4000000)
## 7892) WordCount.log< 7.220371 7 1 N (0.8571429 0.1428571) *
## 7893) WordCount.log>=7.220371 33 15 N (0.5454545 0.4545455)
## 15786) WordCount.log>=7.238497 15 5 N (0.6666667 0.3333333) *
## 15787) WordCount.log< 7.238497 18 8 Y (0.4444444 0.5555556) *
## 3947) WordCount.log< 7.20897 15 6 Y (0.4000000 0.6000000) *
## 987) WordCount.log>=7.257355 16 6 Y (0.3750000 0.6250000) *
## 247) WordCount.log< 7.17434 9 2 Y (0.2222222 0.7777778) *
## 31) WordCount.log< 6.775937 187 81 N (0.5668449 0.4331551)
## 62) WordCount.log< 6.771362 177 73 N (0.5875706 0.4124294)
## 124) WordCount.log< 6.736373 125 48 N (0.6160000 0.3840000)
## 248) WordCount.log>=6.713563 40 11 N (0.7250000 0.2750000) *
## 249) WordCount.log< 6.713563 85 37 N (0.5647059 0.4352941)
## 498) WordCount.log< 6.698884 56 21 N (0.6250000 0.3750000)
## 996) WordCount.log>=6.691463 12 2 N (0.8333333 0.1666667) *
## 997) WordCount.log< 6.691463 44 19 N (0.5681818 0.4318182)
## 1994) WordCount.log< 6.685236 35 12 N (0.6571429 0.3428571) *
## 1995) WordCount.log>=6.685236 9 2 Y (0.2222222 0.7777778) *
## 499) WordCount.log>=6.698884 29 13 Y (0.4482759 0.5517241) *
## 125) WordCount.log>=6.736373 52 25 N (0.5192308 0.4807692)
## 250) WordCount.log>=6.745823 40 16 N (0.6000000 0.4000000) *
## 251) WordCount.log< 6.745823 12 3 Y (0.2500000 0.7500000) *
## 63) WordCount.log>=6.771362 10 2 Y (0.2000000 0.8000000) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.2890821
## 3 0.2 0.4572127
## 4 0.3 0.4481999
## 5 0.4 0.3744208
## 6 0.5 0.3744208
## 7 0.6 0.2414929
## 8 0.7 0.1339829
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.N
## 1 N 3213
## 2 Y 375
## Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.Y
## 1 513
## 2 374
## Prediction
## Reference N Y
## N 3213 513
## Y 375 374
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.015642e-01 3.368571e-01 7.895723e-01 8.131613e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 4.277569e-06
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.28654727
## 2 0.1 0.28307434
## 3 0.2 0.37994723
## 4 0.3 0.32692308
## 5 0.4 0.19793814
## 6 0.5 0.19793814
## 7 0.6 0.13365155
## 8 0.7 0.07894737
## 9 0.8 0.00000000
## 10 0.9 0.00000000
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.N
## 1 N 1443
## 2 Y 200
## Popular.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.Y
## 1 270
## 2 144
## Prediction
## Reference N Y
## N 1443 270
## Y 200 144
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.771511911 0.241360856 0.752744918 0.789501902 0.832766164
## AccuracyPValue McnemarPValue
## 1.000000000 0.001458922
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.cp.0.rpart rpart WordCount.log 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.591 0.058 0.7074274
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4572127 0.8015642
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.7895723 0.8131613 0.3368571 0.6504263
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.3799472 0.7715119
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7527449 0.7895019 0.2413609
# Fit an rpart model on the single feature most correlated with the response
# (max_cor_y_x_var), with cross-validation, as a low-complexity baseline.
# For multinomials this model will be run next by default, so only fit it
# explicitly for regression / binomial classification.
# NOTE: the if body is braced — the original relied on an unbraced if
# controlling a multi-line call, which is fragile to later edits.
if (glb_is_regression || glb_is_binomial) {
    ret_lst <- myfit_mdl(model_id="Max.cor.Y",
                         model_method="rpart",
                         model_type=glb_model_type,
                         indep_vars_vctr=max_cor_y_x_var,
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
                         n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
}
## [1] "fitting model: Max.cor.Y.rpart"
## [1] " indep_vars: WordCount.log"
## + Fold1: cp=0.001335
## - Fold1: cp=0.001335
## + Fold2: cp=0.001335
## - Fold2: cp=0.001335
## + Fold3: cp=0.001335
## - Fold3: cp=0.001335
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00223 on full training set
## Warning in myfit_mdl(model_id = "Max.cor.Y", model_method = "rpart",
## model_type = glb_model_type, : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0.002225189 0 1
##
## Node number 1: 4475 observations
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.rpart.N
## 1 N 3726
## 2 Y 749
## Prediction
## Reference N Y
## N 3726 0
## Y 749 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.326257e-01 0.000000e+00 8.213602e-01 8.434553e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 5.097571e-01 1.800616e-164
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.rpart.N
## 1 N 1713
## 2 Y 344
## Prediction
## Reference N Y
## N 1713 0
## Y 344 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.327662e-01 0.000000e+00 8.159247e-01 8.486533e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 5.143944e-01 2.337097e-76
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.rpart rpart WordCount.log 3
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 1.288 0.066 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.8174308
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0.06210715 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.8327662
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.002468564 0.01888548
# Used to compare vs. Interactions.High.cor.Y
# Fit a parametric model (lm for regression, glm for binomial classification,
# rpart otherwise) on the single feature most correlated with the response.
# glb_is_regression / glb_is_binomial are scalar logicals, so a plain
# if/else chain is used instead of nested ifelse() (which is meant for
# vectors and can strip attributes on scalars).
ret_lst <- myfit_mdl(model_id="Max.cor.Y",
                     model_method=if (glb_is_regression) "lm" else
                                  if (glb_is_binomial) "glm" else "rpart",
                     model_type=glb_model_type,
                     indep_vars_vctr=max_cor_y_x_var,
                     rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
                     n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
## [1] "fitting model: Max.cor.Y.glm"
## [1] " indep_vars: WordCount.log"
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4353 -0.6490 -0.4807 -0.2734 3.2543
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.02059 0.33309 -21.08 <2e-16 ***
## WordCount.log 0.88928 0.05231 17.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 3670.9 on 4473 degrees of freedom
## AIC: 3674.9
##
## Number of Fisher Scoring iterations: 5
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.286753446
## 2 0.1 0.353155973
## 3 0.2 0.422222222
## 4 0.3 0.287937743
## 5 0.4 0.080367394
## 6 0.5 0.023225806
## 7 0.6 0.013227513
## 8 0.7 0.005326232
## 9 0.8 0.000000000
## 10 0.9 0.000000000
## 11 1.0 0.000000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.glm.N
## 1 N 2700
## 2 Y 274
## Popular.fctr.predict.Max.cor.Y.glm.Y
## 1 1026
## 2 475
## Prediction
## Reference N Y
## N 2700 1026
## Y 274 475
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.094972e-01 2.560981e-01 6.959502e-01 7.227703e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 2.363819e-96
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.286547272
## 2 0.1 0.352486188
## 3 0.2 0.396172249
## 4 0.3 0.333868379
## 5 0.4 0.118483412
## 6 0.5 0.016713092
## 7 0.6 0.011527378
## 8 0.7 0.005797101
## 9 0.8 0.000000000
## 10 0.9 0.000000000
## 11 1.0 0.000000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Max.cor.Y.glm.N
## 1 N 1219
## 2 Y 137
## Popular.fctr.predict.Max.cor.Y.glm.Y
## 1 494
## 2 207
## Prediction
## Reference N Y
## N 1219 494
## Y 137 207
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.932426e-01 2.215049e-01 6.728061e-01 7.131270e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.362906e-45
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.glm glm WordCount.log 1
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 1.161 0.075 0.7301738
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4222222 0.8306149
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.6959502 0.7227703 0.01169498 0.7342331
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.3961722 0.6932426
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.6728061 0.713127 0.2215049 3674.923
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0016274 0.007870559
# Interactions.High.cor.Y
# Fit a model that augments the most-correlated feature (max_cor_y_x_var)
# with its interactions against the other highly-correlated features
# (glb_feats_df$cor.high.X), skipping entirely when no such features exist.
if (length(int_feats <- setdiff(unique(glb_feats_df$cor.high.X), NA)) > 0) {
    # lm & glm handle interaction terms; rpart & rf do not, so fall back to
    # simply adding the features as main effects for tree-based methods
    if (glb_is_regression || glb_is_binomial) {
        indep_vars_vctr <-
            c(max_cor_y_x_var, paste(max_cor_y_x_var, int_feats, sep=":"))
    } else {
        indep_vars_vctr <- union(max_cor_y_x_var, int_feats)
    }
    # Arguments are passed by name for consistency with the sibling
    # myfit_mdl() calls (the original passed indep_vars_vctr / rsp_var /
    # rsp_var_out positionally); the positional order matched these formal
    # names, so behavior is unchanged. Scalar flags use if/else, not ifelse().
    ret_lst <- myfit_mdl(model_id="Interact.High.cor.Y",
                         model_method=if (glb_is_regression) "lm" else
                                      if (glb_is_binomial) "glm" else "rpart",
                         model_type=glb_model_type,
                         indep_vars_vctr=indep_vars_vctr,
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
                         n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
}
## [1] "fitting model: Interact.High.cor.Y.glm"
## [1] " indep_vars: WordCount.log, WordCount.log:PubDate.apm.fctr, WordCount.log:S.can, WordCount.log:S.make, WordCount.log:S.presid, WordCount.log:S.take, WordCount.log:S.new, WordCount.log:S.day, WordCount.log:S.show, WordCount.log:S.report, WordCount.log:S.share, WordCount.log:S.year, WordCount.log:S.compani, WordCount.log:S.first, WordCount.log:S.time, WordCount.log:S.articl, WordCount.log:S.will, WordCount.log:S.newyork, WordCount.log:S.intern, WordCount.log:H.week, WordCount.log:S.week, WordCount.log:S.fashion, WordCount.log:SectionName.nb.fctr, WordCount.log:H.num.chars.log, WordCount.log:NewsDesk.nb.fctr, WordCount.log:A.num.chars.log, WordCount.log:A.num.words.log, WordCount.log:S.num.chars.log"
## + Fold1: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1: parameter=none
## + Fold2: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2: parameter=none
## + Fold3: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: not plotting observations with leverage one:
## 297
## Warning: not plotting observations with leverage one:
## 297
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4666 -0.4327 -0.2394 -0.0824 3.2847
##
## Coefficients: (19 not defined because of singularities)
## Estimate
## (Intercept) -5.927e+00
## WordCount.log 1.747e+00
## `WordCount.log:PubDate.apm.fctrpm` -3.907e-02
## `WordCount.log:S.can` -1.073e-01
## `WordCount.log:S.make` -1.186e-02
## `WordCount.log:S.presid` 2.584e-02
## `WordCount.log:S.take` -3.599e-02
## `WordCount.log:S.new` -4.365e-02
## `WordCount.log:S.day` 1.594e-02
## `WordCount.log:S.show` -6.894e-02
## `WordCount.log:S.report` -8.414e-02
## `WordCount.log:S.share` -8.654e-02
## `WordCount.log:S.year` -4.215e-02
## `WordCount.log:S.compani` -3.663e-02
## `WordCount.log:S.first` -2.477e-02
## `WordCount.log:S.time` -5.711e-02
## `WordCount.log:S.articl` 6.548e-03
## `WordCount.log:S.will` -8.270e-02
## `WordCount.log:S.newyork` -5.133e-02
## `WordCount.log:S.intern` -8.580e-02
## `WordCount.log:H.week` -1.140e-01
## `WordCount.log:S.week` -3.052e-02
## `WordCount.log:S.fashion` -2.030e-01
## `WordCount.log:SectionName.nb.fctrArts` -5.103e-01
## `WordCount.log:SectionName.nb.fctrBusiness Day` -5.456e-01
## `WordCount.log:SectionName.nb.fctrHealth` -5.285e-02
## `WordCount.log:SectionName.nb.fctrOpinion` -1.639e-01
## `WordCount.log:SectionName.nb.fctrWorld` -8.997e-01
## `WordCount.log:SectionName.nb.fctrStyles` 4.997e+12
## `WordCount.log:SectionName.nb.fctrTStyle` -9.630e-01
## `WordCount.log:SectionName.nb.fctrTechnology` -4.047e-01
## `WordCount.log:SectionName.nb.fctrMagazine` -9.743e+00
## `WordCount.log:SectionName.nb.fctrMultimedia` -8.622e-01
## `WordCount.log:SectionName.nb.fctrmyMisc::` -4.583e-01
## `WordCount.log:SectionName.nb.fctrTravel` -8.343e-01
## `WordCount.log:SectionName.nb.fctrU.S.` 4.997e+12
## `WordCount.log:SectionName.nb.fctrN.Y. / Region` -5.762e-01
## `WordCount.log:SectionName.nb.fctrDaily Clip Report::` -4.822e+00
## `WordCount.log:SectionName.nb.fctrOpen` -4.042e+00
## `WordCount.log:SectionName.nb.fctrReaders Respond::` -1.595e-01
## `WordCount.log:SectionName.nb.fctrSports` -6.113e+14
## `WordCount.log:SectionName.nb.fctrNational` 4.997e+12
## `WordCount.log:SectionName.nb.fctrVerbatim::` -7.130e+00
## `WordCount.log:SectionName.nb.fctrFirst Draft::` -4.685e+01
## `WordCount.log:SectionName.nb.fctrToday in Politics::` -1.452e+00
## `WordCount.log:SectionName.nb.fctrReporter's Notebook::` -4.329e-01
## `WordCount.log:SectionName.nb.fctrCulture` NA
## `WordCount.log:SectionName.nb.fctrThe Daily Gift::` NA
## `WordCount.log:H.num.chars.log` -3.488e-03
## `WordCount.log:NewsDesk.nb.fctrCulture` NA
## `WordCount.log:NewsDesk.nb.fctrScience` NA
## `WordCount.log:NewsDesk.nb.fctrOpEd` NA
## `WordCount.log:NewsDesk.nb.fctrForeign` NA
## `WordCount.log:NewsDesk.nb.fctrStyles` -4.997e+12
## `WordCount.log:NewsDesk.nb.fctrTStyle` 2.143e-01
## `WordCount.log:NewsDesk.nb.fctrMagazine` NA
## `WordCount.log:NewsDesk.nb.fctrmyMultimedia` NA
## `WordCount.log:NewsDesk.nb.fctrmyMisc::` NA
## `WordCount.log:NewsDesk.nb.fctrTravel` NA
## `WordCount.log:NewsDesk.nb.fctrmyEducation` -4.997e+12
## `WordCount.log:NewsDesk.nb.fctrMetro` NA
## `WordCount.log:NewsDesk.nb.fctrDaily Clip Report::` NA
## `WordCount.log:NewsDesk.nb.fctrReaders Respond::` NA
## `WordCount.log:NewsDesk.nb.fctrNational` -4.997e+12
## `WordCount.log:NewsDesk.nb.fctrSports` NA
## `WordCount.log:NewsDesk.nb.fctrVerbatim::` NA
## `WordCount.log:NewsDesk.nb.fctrFirst Draft::` NA
## `WordCount.log:NewsDesk.nb.fctrToday in Politics::` NA
## `WordCount.log:NewsDesk.nb.fctrReporter's Notebook::` NA
## `WordCount.log:NewsDesk.nb.fctrThe Daily Gift::` NA
## `WordCount.log:A.num.chars.log` -1.348e+00
## `WordCount.log:A.num.words.log` -5.482e-02
## `WordCount.log:S.num.chars.log` 1.263e+00
## Std. Error
## (Intercept) 4.242e-01
## WordCount.log 1.965e-01
## `WordCount.log:PubDate.apm.fctrpm` 1.855e-02
## `WordCount.log:S.can` 4.125e-02
## `WordCount.log:S.make` 4.031e-02
## `WordCount.log:S.presid` 4.402e-02
## `WordCount.log:S.take` 5.401e-02
## `WordCount.log:S.new` 3.026e-02
## `WordCount.log:S.day` 5.204e-02
## `WordCount.log:S.show` 5.763e-02
## `WordCount.log:S.report` 5.072e-02
## `WordCount.log:S.share` 6.287e-02
## `WordCount.log:S.year` 4.365e-02
## `WordCount.log:S.compani` 3.983e-02
## `WordCount.log:S.first` 6.239e-02
## `WordCount.log:S.time` 4.002e-02
## `WordCount.log:S.articl` 9.861e-02
## `WordCount.log:S.will` 3.313e-02
## `WordCount.log:S.newyork` 5.282e-02
## `WordCount.log:S.intern` 1.110e-01
## `WordCount.log:H.week` 9.044e-02
## `WordCount.log:S.week` 4.884e-02
## `WordCount.log:S.fashion` 1.805e-01
## `WordCount.log:SectionName.nb.fctrArts` 6.486e-02
## `WordCount.log:SectionName.nb.fctrBusiness Day` 6.663e-02
## `WordCount.log:SectionName.nb.fctrHealth` 6.616e-02
## `WordCount.log:SectionName.nb.fctrOpinion` 5.812e-02
## `WordCount.log:SectionName.nb.fctrWorld` 1.286e-01
## `WordCount.log:SectionName.nb.fctrStyles` 1.930e+13
## `WordCount.log:SectionName.nb.fctrTStyle` 2.617e-01
## `WordCount.log:SectionName.nb.fctrTechnology` 7.246e-02
## `WordCount.log:SectionName.nb.fctrMagazine` 1.919e+03
## `WordCount.log:SectionName.nb.fctrMultimedia` 1.715e-01
## `WordCount.log:SectionName.nb.fctrmyMisc::` 6.023e-02
## `WordCount.log:SectionName.nb.fctrTravel` 1.820e-01
## `WordCount.log:SectionName.nb.fctrU.S.` 1.930e+13
## `WordCount.log:SectionName.nb.fctrN.Y. / Region` 7.514e-02
## `WordCount.log:SectionName.nb.fctrDaily Clip Report::` 7.339e+03
## `WordCount.log:SectionName.nb.fctrOpen` 3.718e+04
## `WordCount.log:SectionName.nb.fctrReaders Respond::` 1.150e-01
## `WordCount.log:SectionName.nb.fctrSports` 9.109e+06
## `WordCount.log:SectionName.nb.fctrNational` 1.930e+13
## `WordCount.log:SectionName.nb.fctrVerbatim::` 1.922e+04
## `WordCount.log:SectionName.nb.fctrFirst Draft::` 2.172e+06
## `WordCount.log:SectionName.nb.fctrToday in Politics::` 7.923e-01
## `WordCount.log:SectionName.nb.fctrReporter's Notebook::` 1.859e-01
## `WordCount.log:SectionName.nb.fctrCulture` NA
## `WordCount.log:SectionName.nb.fctrThe Daily Gift::` NA
## `WordCount.log:H.num.chars.log` 2.526e-02
## `WordCount.log:NewsDesk.nb.fctrCulture` NA
## `WordCount.log:NewsDesk.nb.fctrScience` NA
## `WordCount.log:NewsDesk.nb.fctrOpEd` NA
## `WordCount.log:NewsDesk.nb.fctrForeign` NA
## `WordCount.log:NewsDesk.nb.fctrStyles` 1.930e+13
## `WordCount.log:NewsDesk.nb.fctrTStyle` 2.471e-01
## `WordCount.log:NewsDesk.nb.fctrMagazine` NA
## `WordCount.log:NewsDesk.nb.fctrmyMultimedia` NA
## `WordCount.log:NewsDesk.nb.fctrmyMisc::` NA
## `WordCount.log:NewsDesk.nb.fctrTravel` NA
## `WordCount.log:NewsDesk.nb.fctrmyEducation` 1.930e+13
## `WordCount.log:NewsDesk.nb.fctrMetro` NA
## `WordCount.log:NewsDesk.nb.fctrDaily Clip Report::` NA
## `WordCount.log:NewsDesk.nb.fctrReaders Respond::` NA
## `WordCount.log:NewsDesk.nb.fctrNational` 1.930e+13
## `WordCount.log:NewsDesk.nb.fctrSports` NA
## `WordCount.log:NewsDesk.nb.fctrVerbatim::` NA
## `WordCount.log:NewsDesk.nb.fctrFirst Draft::` NA
## `WordCount.log:NewsDesk.nb.fctrToday in Politics::` NA
## `WordCount.log:NewsDesk.nb.fctrReporter's Notebook::` NA
## `WordCount.log:NewsDesk.nb.fctrThe Daily Gift::` NA
## `WordCount.log:A.num.chars.log` 9.410e-01
## `WordCount.log:A.num.words.log` 7.366e-02
## `WordCount.log:S.num.chars.log` 9.402e-01
## z value
## (Intercept) -1.397e+01
## WordCount.log 8.890e+00
## `WordCount.log:PubDate.apm.fctrpm` -2.107e+00
## `WordCount.log:S.can` -2.602e+00
## `WordCount.log:S.make` -2.940e-01
## `WordCount.log:S.presid` 5.870e-01
## `WordCount.log:S.take` -6.660e-01
## `WordCount.log:S.new` -1.442e+00
## `WordCount.log:S.day` 3.060e-01
## `WordCount.log:S.show` -1.196e+00
## `WordCount.log:S.report` -1.659e+00
## `WordCount.log:S.share` -1.377e+00
## `WordCount.log:S.year` -9.660e-01
## `WordCount.log:S.compani` -9.200e-01
## `WordCount.log:S.first` -3.970e-01
## `WordCount.log:S.time` -1.427e+00
## `WordCount.log:S.articl` 6.600e-02
## `WordCount.log:S.will` -2.496e+00
## `WordCount.log:S.newyork` -9.720e-01
## `WordCount.log:S.intern` -7.730e-01
## `WordCount.log:H.week` -1.260e+00
## `WordCount.log:S.week` -6.250e-01
## `WordCount.log:S.fashion` -1.124e+00
## `WordCount.log:SectionName.nb.fctrArts` -7.868e+00
## `WordCount.log:SectionName.nb.fctrBusiness Day` -8.188e+00
## `WordCount.log:SectionName.nb.fctrHealth` -7.990e-01
## `WordCount.log:SectionName.nb.fctrOpinion` -2.820e+00
## `WordCount.log:SectionName.nb.fctrWorld` -6.997e+00
## `WordCount.log:SectionName.nb.fctrStyles` 2.590e-01
## `WordCount.log:SectionName.nb.fctrTStyle` -3.680e+00
## `WordCount.log:SectionName.nb.fctrTechnology` -5.585e+00
## `WordCount.log:SectionName.nb.fctrMagazine` -5.000e-03
## `WordCount.log:SectionName.nb.fctrMultimedia` -5.028e+00
## `WordCount.log:SectionName.nb.fctrmyMisc::` -7.609e+00
## `WordCount.log:SectionName.nb.fctrTravel` -4.584e+00
## `WordCount.log:SectionName.nb.fctrU.S.` 2.590e-01
## `WordCount.log:SectionName.nb.fctrN.Y. / Region` -7.669e+00
## `WordCount.log:SectionName.nb.fctrDaily Clip Report::` -1.000e-03
## `WordCount.log:SectionName.nb.fctrOpen` 0.000e+00
## `WordCount.log:SectionName.nb.fctrReaders Respond::` -1.387e+00
## `WordCount.log:SectionName.nb.fctrSports` -6.711e+07
## `WordCount.log:SectionName.nb.fctrNational` 2.590e-01
## `WordCount.log:SectionName.nb.fctrVerbatim::` 0.000e+00
## `WordCount.log:SectionName.nb.fctrFirst Draft::` 0.000e+00
## `WordCount.log:SectionName.nb.fctrToday in Politics::` -1.833e+00
## `WordCount.log:SectionName.nb.fctrReporter's Notebook::` -2.328e+00
## `WordCount.log:SectionName.nb.fctrCulture` NA
## `WordCount.log:SectionName.nb.fctrThe Daily Gift::` NA
## `WordCount.log:H.num.chars.log` -1.380e-01
## `WordCount.log:NewsDesk.nb.fctrCulture` NA
## `WordCount.log:NewsDesk.nb.fctrScience` NA
## `WordCount.log:NewsDesk.nb.fctrOpEd` NA
## `WordCount.log:NewsDesk.nb.fctrForeign` NA
## `WordCount.log:NewsDesk.nb.fctrStyles` -2.590e-01
## `WordCount.log:NewsDesk.nb.fctrTStyle` 8.670e-01
## `WordCount.log:NewsDesk.nb.fctrMagazine` NA
## `WordCount.log:NewsDesk.nb.fctrmyMultimedia` NA
## `WordCount.log:NewsDesk.nb.fctrmyMisc::` NA
## `WordCount.log:NewsDesk.nb.fctrTravel` NA
## `WordCount.log:NewsDesk.nb.fctrmyEducation` -2.590e-01
## `WordCount.log:NewsDesk.nb.fctrMetro` NA
## `WordCount.log:NewsDesk.nb.fctrDaily Clip Report::` NA
## `WordCount.log:NewsDesk.nb.fctrReaders Respond::` NA
## `WordCount.log:NewsDesk.nb.fctrNational` -2.590e-01
## `WordCount.log:NewsDesk.nb.fctrSports` NA
## `WordCount.log:NewsDesk.nb.fctrVerbatim::` NA
## `WordCount.log:NewsDesk.nb.fctrFirst Draft::` NA
## `WordCount.log:NewsDesk.nb.fctrToday in Politics::` NA
## `WordCount.log:NewsDesk.nb.fctrReporter's Notebook::` NA
## `WordCount.log:NewsDesk.nb.fctrThe Daily Gift::` NA
## `WordCount.log:A.num.chars.log` -1.433e+00
## `WordCount.log:A.num.words.log` -7.440e-01
## `WordCount.log:S.num.chars.log` 1.343e+00
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## WordCount.log < 2e-16 ***
## `WordCount.log:PubDate.apm.fctrpm` 0.035125 *
## `WordCount.log:S.can` 0.009277 **
## `WordCount.log:S.make` 0.768595
## `WordCount.log:S.presid` 0.557220
## `WordCount.log:S.take` 0.505139
## `WordCount.log:S.new` 0.149207
## `WordCount.log:S.day` 0.759408
## `WordCount.log:S.show` 0.231603
## `WordCount.log:S.report` 0.097124 .
## `WordCount.log:S.share` 0.168655
## `WordCount.log:S.year` 0.334167
## `WordCount.log:S.compani` 0.357766
## `WordCount.log:S.first` 0.691359
## `WordCount.log:S.time` 0.153573
## `WordCount.log:S.articl` 0.947062
## `WordCount.log:S.will` 0.012559 *
## `WordCount.log:S.newyork` 0.331166
## `WordCount.log:S.intern` 0.439536
## `WordCount.log:H.week` 0.207616
## `WordCount.log:S.week` 0.532028
## `WordCount.log:S.fashion` 0.260811
## `WordCount.log:SectionName.nb.fctrArts` 3.62e-15 ***
## `WordCount.log:SectionName.nb.fctrBusiness Day` 2.66e-16 ***
## `WordCount.log:SectionName.nb.fctrHealth` 0.424328
## `WordCount.log:SectionName.nb.fctrOpinion` 0.004795 **
## `WordCount.log:SectionName.nb.fctrWorld` 2.62e-12 ***
## `WordCount.log:SectionName.nb.fctrStyles` 0.795725
## `WordCount.log:SectionName.nb.fctrTStyle` 0.000234 ***
## `WordCount.log:SectionName.nb.fctrTechnology` 2.33e-08 ***
## `WordCount.log:SectionName.nb.fctrMagazine` 0.995949
## `WordCount.log:SectionName.nb.fctrMultimedia` 4.96e-07 ***
## `WordCount.log:SectionName.nb.fctrmyMisc::` 2.76e-14 ***
## `WordCount.log:SectionName.nb.fctrTravel` 4.56e-06 ***
## `WordCount.log:SectionName.nb.fctrU.S.` 0.795725
## `WordCount.log:SectionName.nb.fctrN.Y. / Region` 1.74e-14 ***
## `WordCount.log:SectionName.nb.fctrDaily Clip Report::` 0.999476
## `WordCount.log:SectionName.nb.fctrOpen` 0.999913
## `WordCount.log:SectionName.nb.fctrReaders Respond::` 0.165462
## `WordCount.log:SectionName.nb.fctrSports` < 2e-16 ***
## `WordCount.log:SectionName.nb.fctrNational` 0.795725
## `WordCount.log:SectionName.nb.fctrVerbatim::` 0.999704
## `WordCount.log:SectionName.nb.fctrFirst Draft::` 0.999983
## `WordCount.log:SectionName.nb.fctrToday in Politics::` 0.066836 .
## `WordCount.log:SectionName.nb.fctrReporter's Notebook::` 0.019909 *
## `WordCount.log:SectionName.nb.fctrCulture` NA
## `WordCount.log:SectionName.nb.fctrThe Daily Gift::` NA
## `WordCount.log:H.num.chars.log` 0.890180
## `WordCount.log:NewsDesk.nb.fctrCulture` NA
## `WordCount.log:NewsDesk.nb.fctrScience` NA
## `WordCount.log:NewsDesk.nb.fctrOpEd` NA
## `WordCount.log:NewsDesk.nb.fctrForeign` NA
## `WordCount.log:NewsDesk.nb.fctrStyles` 0.795725
## `WordCount.log:NewsDesk.nb.fctrTStyle` 0.385706
## `WordCount.log:NewsDesk.nb.fctrMagazine` NA
## `WordCount.log:NewsDesk.nb.fctrmyMultimedia` NA
## `WordCount.log:NewsDesk.nb.fctrmyMisc::` NA
## `WordCount.log:NewsDesk.nb.fctrTravel` NA
## `WordCount.log:NewsDesk.nb.fctrmyEducation` 0.795725
## `WordCount.log:NewsDesk.nb.fctrMetro` NA
## `WordCount.log:NewsDesk.nb.fctrDaily Clip Report::` NA
## `WordCount.log:NewsDesk.nb.fctrReaders Respond::` NA
## `WordCount.log:NewsDesk.nb.fctrNational` 0.795725
## `WordCount.log:NewsDesk.nb.fctrSports` NA
## `WordCount.log:NewsDesk.nb.fctrVerbatim::` NA
## `WordCount.log:NewsDesk.nb.fctrFirst Draft::` NA
## `WordCount.log:NewsDesk.nb.fctrToday in Politics::` NA
## `WordCount.log:NewsDesk.nb.fctrReporter's Notebook::` NA
## `WordCount.log:NewsDesk.nb.fctrThe Daily Gift::` NA
## `WordCount.log:A.num.chars.log` 0.151970
## `WordCount.log:A.num.words.log` 0.456777
## `WordCount.log:S.num.chars.log` 0.179180
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 2396.6 on 4421 degrees of freedom
## AIC: 2504.6
##
## Number of Fisher Scoring iterations: 25
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.28675345
## 2 0.1 0.56728778
## 3 0.2 0.67092281
## 4 0.3 0.68379447
## 5 0.4 0.66150598
## 6 0.5 0.62865716
## 7 0.6 0.57856567
## 8 0.7 0.50133333
## 9 0.8 0.35582822
## 10 0.9 0.08080808
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 3476
## 2 Y 230
## Popular.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 250
## 2 519
## Prediction
## Reference N Y
## N 3476 250
## Y 230 519
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.927374e-01 6.192224e-01 8.833016e-01 9.016572e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 3.083680e-30 3.858174e-01
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.28654727
## 2 0.1 0.55536028
## 3 0.2 0.67082294
## 4 0.3 0.69153515
## 5 0.4 0.67867868
## 6 0.5 0.66244057
## 7 0.6 0.59829060
## 8 0.7 0.50763359
## 9 0.8 0.33860045
## 10 0.9 0.07162534
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 1601
## 2 Y 103
## Popular.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 112
## 2 241
## Prediction
## Reference N Y
## N 1601 112
## Y 103 241
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.954789e-01 6.286271e-01 8.814466e-01 9.083722e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 4.319079e-16 5.853440e-01
## model_id model_method
## 1 Interact.High.cor.Y.glm glm
## feats
## 1 WordCount.log, WordCount.log:PubDate.apm.fctr, WordCount.log:S.can, WordCount.log:S.make, WordCount.log:S.presid, WordCount.log:S.take, WordCount.log:S.new, WordCount.log:S.day, WordCount.log:S.show, WordCount.log:S.report, WordCount.log:S.share, WordCount.log:S.year, WordCount.log:S.compani, WordCount.log:S.first, WordCount.log:S.time, WordCount.log:S.articl, WordCount.log:S.will, WordCount.log:S.newyork, WordCount.log:S.intern, WordCount.log:H.week, WordCount.log:S.week, WordCount.log:S.fashion, WordCount.log:SectionName.nb.fctr, WordCount.log:H.num.chars.log, WordCount.log:NewsDesk.nb.fctr, WordCount.log:A.num.chars.log, WordCount.log:A.num.words.log, WordCount.log:S.num.chars.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 6.445 1.862
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.9117549 0.3 0.6837945 0.726509
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8833016 0.9016572 0.2991516 0.9127805
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.6915352 0.8954789
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.8814466 0.9083722 0.6286271 2504.643
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.253696 0.2647298
# Low.cor.X: fit a model on features that survive the pairwise-correlation
# filter (is.na(cor.high.X) => no highly-correlated partner was recorded)
# and are not user-excluded (exclude.as.feat != 1).  For binomial
# classification, additionally restrict to conditionally-informative
# features (is.ConditionalX.y), mirroring the Conditional.X selection below.
if (glb_is_classification && glb_is_binomial) {
    indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
                                            is.ConditionalX.y &
                                            (exclude.as.feat != 1))[, "id"]
} else {
    indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
                                            (exclude.as.feat != 1))[, "id"]
}
ret_lst <- myfit_mdl(model_id="Low.cor.X",
                     model_method=ifelse(glb_is_regression, "lm",
                                         ifelse(glb_is_binomial, "glm", "rpart")),
                     indep_vars_vctr=indep_vars_vctr,
                     model_type=glb_model_type,
                     # Name the response arguments explicitly, matching the
                     # other myfit_mdl calls in this file; positional
                     # arguments after named ones are fragile if the
                     # function's formals are ever reordered.
                     rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
                     n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
## [1] "fitting model: Low.cor.X.glm"
## [1] " indep_vars: WordCount.log, H.is.question, PubDate.apm.fctr, S.can, H.has.ebola, S.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, S.take, PubDate.minute, S.new, PubDate.wkday.fctr, S.day, H.X2014, S.show, S.report, S.share, S.year, S.compani, H.new, S.first, S.time, H.newyork, S.articl, S.will, H.day, S.newyork, H.today, H.report, S.intern, H.week, S.week, S.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, H.num.words.log, A.num.chars.log, A.num.words.log, A.num.words.unq.log"
## + Fold1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1: parameter=none
## + Fold2: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2: parameter=none
## + Fold3: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: not plotting observations with leverage one:
## 297
## Warning: not plotting observations with leverage one:
## 297
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.84925 -0.40284 -0.19920 -0.00007 3.06190
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 1.160e+00 1.134e+00
## WordCount.log 8.608e-01 7.533e-02
## H.is.question 6.068e-01 1.893e-01
## PubDate.apm.fctrpm -1.509e-01 1.203e-01
## S.can -6.972e-01 2.702e-01
## H.has.ebola -5.793e-01 4.482e-01
## S.make -1.297e-01 2.647e-01
## S.one 1.473e+01 6.869e+03
## S.state 7.304e-03 3.041e-01
## A.state NA NA
## A.one -1.469e+01 6.869e+03
## S.said 5.681e-01 2.548e-01
## A.said NA NA
## .rnorm -2.403e-02 5.438e-02
## `PubDate.date.fctr(7,13]` 5.325e-02 1.696e-01
## `PubDate.date.fctr(13,19]` -7.337e-02 1.685e-01
## `PubDate.date.fctr(19,25]` -2.037e-01 1.668e-01
## `PubDate.date.fctr(25,31]` 4.079e-02 1.787e-01
## PubDate.second 1.448e-03 3.173e-03
## S.presid 1.425e-01 2.787e-01
## S.take -6.040e-02 3.405e-01
## PubDate.minute 4.814e-03 3.120e-03
## S.new -1.895e-01 1.973e-01
## PubDate.wkday.fctr1 -9.398e-01 2.525e-01
## PubDate.wkday.fctr2 -1.156e+00 2.575e-01
## PubDate.wkday.fctr3 -1.037e+00 2.545e-01
## PubDate.wkday.fctr4 -1.276e+00 2.571e-01
## PubDate.wkday.fctr5 -1.258e+00 2.602e-01
## PubDate.wkday.fctr6 -9.151e-01 4.023e-01
## S.day 1.148e-01 3.588e-01
## H.X2014 -1.769e+00 9.156e-01
## S.show -3.590e-01 3.709e-01
## S.report -6.298e-02 3.251e-01
## S.share -9.059e-01 4.757e-01
## S.year -3.281e-01 2.705e-01
## S.compani -2.970e-01 2.644e-01
## H.new -3.499e-01 3.878e-01
## S.first -8.464e-02 3.738e-01
## S.time -1.684e-01 2.472e-01
## H.newyork 1.303e-01 6.058e-01
## S.articl -8.789e-01 5.910e-01
## S.will -4.059e-01 2.046e-01
## H.day 1.566e-02 5.123e-01
## S.newyork 2.525e-02 3.264e-01
## H.today -1.678e+01 3.805e+03
## H.report -8.509e-01 6.795e-01
## S.intern -2.592e-01 7.017e-01
## H.week 3.097e-01 5.426e-01
## S.week -4.795e-01 3.461e-01
## S.fashion -5.049e-02 1.040e+00
## `Headline.pfx.fctr19[0-9][0-9]::` -1.417e+01 1.007e+03
## `Headline.pfx.fctrDaily Report::` -1.612e+01 1.660e+03
## `Headline.pfx.fctr.*Fashion Week::` -1.518e+01 9.090e+02
## `Headline.pfx.fctrWhat We're::` -1.994e+01 2.009e+03
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` -1.214e+01 1.584e+03
## `Headline.pfx.fctrToday in Small Business::` -1.441e-01 4.141e+03
## `Headline.pfx.fctrDaily Clip Report::` -1.811e+01 1.617e+03
## `Headline.pfx.fctrMorning Agenda::` -1.741e+01 1.631e+03
## `Headline.pfx.fctrNew York Today::` 1.432e+01 3.805e+03
## `Headline.pfx.fctr6 Q's About the News::` -1.834e+01 1.684e+03
## `Headline.pfx.fctrTest Yourself::` -1.642e+01 1.644e+03
## `Headline.pfx.fctrWord of the Day::` -1.750e+01 1.633e+03
## `Headline.pfx.fctrmyTech::` 2.072e-01 3.542e-01
## `Headline.pfx.fctrYour Turn::` 4.419e+00 1.091e+00
## `Headline.pfx.fctrReaders Respond::` 6.528e-02 1.126e+00
## `Headline.pfx.fctrAsk Well::` 7.556e-02 9.930e-01
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` 1.543e+00 1.441e+00
## `Headline.pfx.fctrOn This Day::` -1.855e+01 3.547e+03
## `Headline.pfx.fctrVerbatim::` -1.876e+01 2.214e+03
## `Headline.pfx.fctrFirst Draft::` -2.004e+01 1.504e+03
## `Headline.pfx.fctrToday in Politics::` -4.898e+00 4.267e+03
## `Headline.pfx.fctrReporter's Notebook::` -1.673e+00 1.350e+00
## `Headline.pfx.fctrmyFood::` -1.526e+00 6.787e-01
## `Headline.pfx.fctrThe Daily Gift::` -1.418e+01 2.411e+03
## SectionName.nb.fctrArts -3.308e+00 3.975e-01
## `SectionName.nb.fctrBusiness Day` -3.401e+00 3.961e-01
## SectionName.nb.fctrHealth -3.276e-01 4.034e-01
## SectionName.nb.fctrOpinion -5.825e-01 3.537e-01
## SectionName.nb.fctrWorld -5.564e+00 8.122e-01
## SectionName.nb.fctrStyles -2.068e+01 1.487e+03
## SectionName.nb.fctrTStyle -4.583e+00 5.237e-01
## SectionName.nb.fctrTechnology -2.437e+00 4.277e-01
## SectionName.nb.fctrMagazine -2.037e+01 2.176e+03
## SectionName.nb.fctrMultimedia -5.285e+00 1.073e+00
## `SectionName.nb.fctrmyMisc::` -2.995e+00 3.696e-01
## SectionName.nb.fctrTravel -4.788e+00 1.063e+00
## SectionName.nb.fctrU.S. -1.621e+00 4.003e-01
## `SectionName.nb.fctrN.Y. / Region` -2.565e+00 5.003e-01
## `SectionName.nb.fctrDaily Clip Report::` NA NA
## SectionName.nb.fctrOpen -2.091e+01 7.546e+03
## `SectionName.nb.fctrReaders Respond::` -7.144e-01 1.347e+00
## SectionName.nb.fctrSports -1.996e+01 1.075e+04
## SectionName.nb.fctrNational -1.958e+01 7.571e+03
## `SectionName.nb.fctrVerbatim::` NA NA
## `SectionName.nb.fctrFirst Draft::` NA NA
## `SectionName.nb.fctrToday in Politics::` NA NA
## `SectionName.nb.fctrReporter's Notebook::` NA NA
## SectionName.nb.fctrCulture NA NA
## `SectionName.nb.fctrThe Daily Gift::` NA NA
## H.num.chars.log -7.265e-02 3.184e-01
## H.num.words.log -1.889e-01 3.731e-01
## A.num.chars.log -9.813e-01 4.279e-01
## A.num.words.log 1.981e+00 1.366e+00
## A.num.words.unq.log -1.600e+00 1.304e+00
## z value Pr(>|z|)
## (Intercept) 1.023 0.306301
## WordCount.log 11.427 < 2e-16 ***
## H.is.question 3.206 0.001347 **
## PubDate.apm.fctrpm -1.254 0.209892
## S.can -2.581 0.009865 **
## H.has.ebola -1.293 0.196156
## S.make -0.490 0.624306
## S.one 0.002 0.998289
## S.state 0.024 0.980836
## A.state NA NA
## A.one -0.002 0.998293
## S.said 2.230 0.025744 *
## A.said NA NA
## .rnorm -0.442 0.658596
## `PubDate.date.fctr(7,13]` 0.314 0.753479
## `PubDate.date.fctr(13,19]` -0.435 0.663263
## `PubDate.date.fctr(19,25]` -1.221 0.222054
## `PubDate.date.fctr(25,31]` 0.228 0.819433
## PubDate.second 0.456 0.648153
## S.presid 0.511 0.609055
## S.take -0.177 0.859209
## PubDate.minute 1.543 0.122849
## S.new -0.960 0.336807
## PubDate.wkday.fctr1 -3.723 0.000197 ***
## PubDate.wkday.fctr2 -4.488 7.20e-06 ***
## PubDate.wkday.fctr3 -4.073 4.63e-05 ***
## PubDate.wkday.fctr4 -4.965 6.89e-07 ***
## PubDate.wkday.fctr5 -4.836 1.33e-06 ***
## PubDate.wkday.fctr6 -2.275 0.022924 *
## S.day 0.320 0.748920
## H.X2014 -1.932 0.053320 .
## S.show -0.968 0.332997
## S.report -0.194 0.846370
## S.share -1.904 0.056889 .
## S.year -1.213 0.225082
## S.compani -1.123 0.261331
## H.new -0.902 0.366965
## S.first -0.226 0.820880
## S.time -0.681 0.495757
## H.newyork 0.215 0.829683
## S.articl -1.487 0.136979
## S.will -1.983 0.047315 *
## H.day 0.031 0.975607
## S.newyork 0.077 0.938342
## H.today -0.004 0.996481
## H.report -1.252 0.210479
## S.intern -0.369 0.711876
## H.week 0.571 0.568200
## S.week -1.386 0.165870
## S.fashion -0.049 0.961292
## `Headline.pfx.fctr19[0-9][0-9]::` -0.014 0.988775
## `Headline.pfx.fctrDaily Report::` -0.010 0.992252
## `Headline.pfx.fctr.*Fashion Week::` -0.017 0.986675
## `Headline.pfx.fctrWhat We're::` -0.010 0.992080
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` -0.008 0.993883
## `Headline.pfx.fctrToday in Small Business::` 0.000 0.999972
## `Headline.pfx.fctrDaily Clip Report::` -0.011 0.991067
## `Headline.pfx.fctrMorning Agenda::` -0.011 0.991483
## `Headline.pfx.fctrNew York Today::` 0.004 0.996997
## `Headline.pfx.fctr6 Q's About the News::` -0.011 0.991311
## `Headline.pfx.fctrTest Yourself::` -0.010 0.992028
## `Headline.pfx.fctrWord of the Day::` -0.011 0.991447
## `Headline.pfx.fctrmyTech::` 0.585 0.558522
## `Headline.pfx.fctrYour Turn::` 4.050 5.13e-05 ***
## `Headline.pfx.fctrReaders Respond::` 0.058 0.953781
## `Headline.pfx.fctrAsk Well::` 0.076 0.939344
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` 1.071 0.284102
## `Headline.pfx.fctrOn This Day::` -0.005 0.995827
## `Headline.pfx.fctrVerbatim::` -0.008 0.993239
## `Headline.pfx.fctrFirst Draft::` -0.013 0.989372
## `Headline.pfx.fctrToday in Politics::` -0.001 0.999084
## `Headline.pfx.fctrReporter's Notebook::` -1.239 0.215215
## `Headline.pfx.fctrmyFood::` -2.249 0.024525 *
## `Headline.pfx.fctrThe Daily Gift::` -0.006 0.995306
## SectionName.nb.fctrArts -8.320 < 2e-16 ***
## `SectionName.nb.fctrBusiness Day` -8.587 < 2e-16 ***
## SectionName.nb.fctrHealth -0.812 0.416681
## SectionName.nb.fctrOpinion -1.647 0.099561 .
## SectionName.nb.fctrWorld -6.850 7.39e-12 ***
## SectionName.nb.fctrStyles -0.014 0.988903
## SectionName.nb.fctrTStyle -8.750 < 2e-16 ***
## SectionName.nb.fctrTechnology -5.697 1.22e-08 ***
## SectionName.nb.fctrMagazine -0.009 0.992532
## SectionName.nb.fctrMultimedia -4.924 8.49e-07 ***
## `SectionName.nb.fctrmyMisc::` -8.103 5.35e-16 ***
## SectionName.nb.fctrTravel -4.506 6.62e-06 ***
## SectionName.nb.fctrU.S. -4.050 5.13e-05 ***
## `SectionName.nb.fctrN.Y. / Region` -5.127 2.94e-07 ***
## `SectionName.nb.fctrDaily Clip Report::` NA NA
## SectionName.nb.fctrOpen -0.003 0.997789
## `SectionName.nb.fctrReaders Respond::` -0.531 0.595740
## SectionName.nb.fctrSports -0.002 0.998519
## SectionName.nb.fctrNational -0.003 0.997936
## `SectionName.nb.fctrVerbatim::` NA NA
## `SectionName.nb.fctrFirst Draft::` NA NA
## `SectionName.nb.fctrToday in Politics::` NA NA
## `SectionName.nb.fctrReporter's Notebook::` NA NA
## SectionName.nb.fctrCulture NA NA
## `SectionName.nb.fctrThe Daily Gift::` NA NA
## H.num.chars.log -0.228 0.819521
## H.num.words.log -0.506 0.612548
## A.num.chars.log -2.293 0.021835 *
## A.num.words.log 1.451 0.146810
## A.num.words.unq.log -1.227 0.219816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 2253.3 on 4380 degrees of freedom
## AIC: 2443.3
##
## Number of Fisher Scoring iterations: 18
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.6034188
## 3 0.2 0.6859688
## 4 0.3 0.6808777
## 5 0.4 0.6597938
## 6 0.5 0.6401180
## 7 0.6 0.5883306
## 8 0.7 0.5199645
## 9 0.8 0.3773966
## 10 0.9 0.1061728
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Low.cor.X.glm.N
## 1 N 3295
## 2 Y 133
## Popular.fctr.predict.Low.cor.X.glm.Y
## 1 431
## 2 616
## Prediction
## Reference N Y
## N 3295 431
## Y 133 616
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.739665e-01 6.098285e-01 8.638863e-01 8.835558e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 9.484585e-15 6.926150e-36
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.28654727
## 2 0.1 0.57372654
## 3 0.2 0.63615023
## 4 0.3 0.66756032
## 5 0.4 0.66569767
## 6 0.5 0.65620094
## 7 0.6 0.60034305
## 8 0.7 0.50095602
## 9 0.8 0.38611714
## 10 0.9 0.09703504
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Low.cor.X.glm.N
## 1 N 1560
## 2 Y 95
## Popular.fctr.predict.Low.cor.X.glm.Y
## 1 153
## 2 249
## Prediction
## Reference N Y
## N 1560 153
## Y 95 249
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.794361e-01 5.944691e-01 8.645784e-01 8.932025e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 2.108709e-09 2.951687e-04
## model_id model_method
## 1 Low.cor.X.glm glm
## feats
## 1 WordCount.log, H.is.question, PubDate.apm.fctr, S.can, H.has.ebola, S.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, S.take, PubDate.minute, S.new, PubDate.wkday.fctr, S.day, H.X2014, S.show, S.report, S.share, S.year, S.compani, H.new, S.first, S.time, H.newyork, S.articl, S.will, H.day, S.newyork, H.today, H.report, S.intern, H.week, S.week, S.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, H.num.words.log, A.num.chars.log, A.num.words.log, A.num.words.unq.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 8.294 2.392
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.9282722 0.2 0.6859688 0.886706
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8638863 0.8835558 0.5565211 0.9144538
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.6675603 0.8794361
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.8645784 0.8932025 0.5944691 2443.257
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008518875 0.02984607
# Advance the pipeline bookkeeping to the next minor step of "fit.models"
# (major.inc=FALSE keeps step_major at 6, bumps step_minor); myadd_chunk
# also stamps bgn/end times and the elapsed seconds of the prior chunk,
# as shown in the label/step/bgn/end/elapsed table printed below.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 9 fit.models 6 0 166.467 214.381 47.914
## 10 fit.models 6 1 214.381 NA NA
# All X that is not user excluded
# Select the candidate feature set once, then fit one model per method in
# glb_models_method_vctr against it.
if (glb_is_classification && glb_is_binomial) {
    model_id_pfx <- "Conditional.X"
    # indep_vars_vctr <- setdiff(names(glb_fitent_df), union(glb_rsp_var, glb_exclude_vars_as_features))
    indep_vars_vctr <- subset(glb_feats_df, is.ConditionalX.y &
                                            (exclude.as.feat != 1))[, "id"]
} else {
    model_id_pfx <- "All.X"
    indep_vars_vctr <- subset(glb_feats_df,
                              (exclude.as.feat != 1))[, "id"]
}
for (method in glb_models_method_vctr) {
    if (method %in% c("rpart", "rf")) {
        # rpart: fubar's the tree
        # rf: skip the scenario w/ .rnorm for speed
        # BUG FIX: use a method-local copy when dropping .rnorm.  The
        # original reassigned indep_vars_vctr itself, so after the first
        # rpart/rf iteration every subsequent method (and any later code)
        # silently lost .rnorm as well.
        mthd_indep_vars_vctr <- setdiff(indep_vars_vctr, c(".rnorm"))
        model_id <- paste0(model_id_pfx, ".no.rnorm")
    } else {
        mthd_indep_vars_vctr <- indep_vars_vctr
        model_id <- model_id_pfx
    }
    ret_lst <- myfit_mdl(model_id=model_id, model_method=method,
                         indep_vars_vctr=mthd_indep_vars_vctr,
                         model_type=glb_model_type,
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
                         n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df)
    # Since caret does not optimize rpart well
#     if (method == "rpart")
#         ret_lst <- myfit_mdl(model_id=paste0(model_id_pfx, ".cp.0"), model_method=method,
#                                 indep_vars_vctr=mthd_indep_vars_vctr,
#                                 model_type=glb_model_type,
#                                 rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
#                                 fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
#                         n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
}
## [1] "fitting model: Conditional.X.glm"
## [1] " indep_vars: WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log"
## + Fold1: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1: parameter=none
## + Fold2: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2: parameter=none
## + Fold3: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: not plotting observations with leverage one:
## 297, 1693, 3596, 3794, 4229, 4338, 4394
## Warning: not plotting observations with leverage one:
## 297, 1693, 3596, 3794, 4229, 4338, 4394
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients: (70 not defined because of singularities)
## Estimate
## (Intercept) -2.615e+15
## WordCount.log 6.729e+14
## PubDate.hour 4.846e+13
## H.is.question 4.518e+14
## PubDate.apm.fctrpm -3.244e+14
## A.can 6.628e+15
## S.can -6.839e+15
## H.has.ebola -3.844e+14
## S.make -1.432e+14
## A.make NA
## S.one -3.230e+15
## S.state 8.547e+13
## A.state NA
## A.one 3.244e+15
## S.said 4.697e+14
## A.said NA
## .rnorm 4.005e+13
## `PubDate.date.fctr(7,13]` -9.128e+13
## `PubDate.date.fctr(13,19]` -3.615e+13
## `PubDate.date.fctr(19,25]` -1.070e+14
## `PubDate.date.fctr(25,31]` -4.750e+11
## PubDate.second 7.223e+11
## S.presid 4.675e+13
## A.presid NA
## S.take -5.371e+15
## A.take 5.380e+15
## PubDate.minute -9.986e+11
## S.new -6.961e+13
## A.new 7.353e+13
## PubDate.wkday.fctr1 -1.187e+14
## PubDate.wkday.fctr2 -3.410e+14
## PubDate.wkday.fctr3 -1.302e+14
## PubDate.wkday.fctr4 -4.438e+14
## PubDate.wkday.fctr5 -2.933e+14
## PubDate.wkday.fctr6 2.339e+14
## S.day -4.936e+15
## A.day 5.005e+15
## H.X2014 -4.198e+13
## S.show -7.351e+13
## A.show NA
## S.report 6.629e+13
## A.report NA
## S.share -1.156e+15
## A.share NA
## S.year -3.518e+14
## A.year NA
## S.compani -3.880e+14
## A.compani NA
## H.new -3.145e+14
## S.first 1.350e+14
## A.first NA
## S.time 4.895e+15
## A.time -4.836e+15
## H.newyork -4.456e+14
## S.articl -1.954e+14
## A.articl NA
## S.will 4.571e+14
## A.will -4.853e+14
## H.day -3.552e+13
## S.newyork 7.286e+13
## A.newyork NA
## H.today -2.246e+15
## H.report -6.168e+14
## S.intern 7.086e+13
## A.intern NA
## H.week -1.980e+14
## H.fashion 5.197e+14
## S.week -8.716e+13
## A.week NA
## S.fashion 2.484e+14
## A.fashion NA
## `Headline.pfx.fctr19[0-9][0-9]::` 2.970e+15
## `Headline.pfx.fctrDaily Report::` 7.838e+13
## `Headline.pfx.fctr.*Fashion Week::` -1.971e+15
## `Headline.pfx.fctrWhat We're::` -5.425e+15
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` 2.378e+14
## `Headline.pfx.fctrToday in Small Business::` -3.228e+14
## `Headline.pfx.fctrDaily Clip Report::` -2.831e+15
## `Headline.pfx.fctrMorning Agenda::` -3.491e+15
## `Headline.pfx.fctrNew York Today::` 1.175e+15
## `Headline.pfx.fctr6 Q's About the News::` 3.692e+15
## `Headline.pfx.fctrTest Yourself::` 1.940e+15
## `Headline.pfx.fctrWord of the Day::` 3.498e+15
## `Headline.pfx.fctrmyTech::` 2.820e+14
## `Headline.pfx.fctrYour Turn::` 3.026e+14
## `Headline.pfx.fctrReaders Respond::` 7.355e+14
## `Headline.pfx.fctrAsk Well::` -3.114e+14
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` 5.409e+15
## `Headline.pfx.fctrOn This Day::` 5.037e+14
## `Headline.pfx.fctrVerbatim::` -2.661e+15
## `Headline.pfx.fctrFirst Draft::` -3.057e+15
## `Headline.pfx.fctrToday in Politics::` -2.071e+14
## `Headline.pfx.fctrReporter's Notebook::` 3.617e+14
## `Headline.pfx.fctrmyFood::` -4.973e+14
## `Headline.pfx.fctrThe Daily Gift::` -6.489e+14
## SectionName.nb.fctrArts -1.024e+15
## `SectionName.nb.fctrBusiness Day` -1.159e+15
## SectionName.nb.fctrHealth 7.578e+14
## SectionName.nb.fctrOpinion 2.271e+15
## SectionName.nb.fctrWorld -2.578e+15
## SectionName.nb.fctrStyles -7.836e+15
## SectionName.nb.fctrTStyle -1.365e+15
## SectionName.nb.fctrTechnology -3.343e+13
## SectionName.nb.fctrMagazine -3.874e+15
## SectionName.nb.fctrMultimedia -2.338e+15
## `SectionName.nb.fctrmyMisc::` -5.410e+14
## SectionName.nb.fctrTravel -1.578e+15
## SectionName.nb.fctrU.S. -4.545e+15
## `SectionName.nb.fctrN.Y. / Region` -3.502e+14
## `SectionName.nb.fctrDaily Clip Report::` NA
## SectionName.nb.fctrOpen -4.561e+15
## `SectionName.nb.fctrReaders Respond::` 5.176e+14
## SectionName.nb.fctrSports -4.265e+15
## SectionName.nb.fctrNational -3.758e+15
## `SectionName.nb.fctrVerbatim::` NA
## `SectionName.nb.fctrFirst Draft::` NA
## `SectionName.nb.fctrToday in Politics::` NA
## `SectionName.nb.fctrReporter's Notebook::` NA
## SectionName.nb.fctrCulture NA
## `SectionName.nb.fctrThe Daily Gift::` NA
## H.num.chars.log 4.094e+13
## NewsDesk.nb.fctrCulture NA
## NewsDesk.nb.fctrScience NA
## NewsDesk.nb.fctrOpEd NA
## NewsDesk.nb.fctrForeign NA
## NewsDesk.nb.fctrStyles 5.820e+15
## NewsDesk.nb.fctrTStyle -8.652e+13
## NewsDesk.nb.fctrMagazine NA
## NewsDesk.nb.fctrmyMultimedia NA
## `NewsDesk.nb.fctrmyMisc::` NA
## NewsDesk.nb.fctrTravel NA
## NewsDesk.nb.fctrmyEducation 4.239e+14
## NewsDesk.nb.fctrMetro NA
## `NewsDesk.nb.fctrDaily Clip Report::` NA
## `NewsDesk.nb.fctrReaders Respond::` NA
## NewsDesk.nb.fctrNational NA
## NewsDesk.nb.fctrSports NA
## `NewsDesk.nb.fctrVerbatim::` NA
## `NewsDesk.nb.fctrFirst Draft::` NA
## `NewsDesk.nb.fctrToday in Politics::` NA
## `NewsDesk.nb.fctrReporter's Notebook::` NA
## `NewsDesk.nb.fctrThe Daily Gift::` NA
## H.num.words.log 7.372e+13
## H.num.words.unq.log -4.308e+14
## `SubsectionName.nb.fctrCulture::Arts` NA
## SubsectionName.nb.fctrDealbook 8.108e+14
## `SubsectionName.nb.fctrScience::Health` NA
## `SubsectionName.nb.fctrOpEd::Opinion` -1.365e+15
## `SubsectionName.nb.fctrRoom For Debate` -4.955e+15
## `SubsectionName.nb.fctrForeign::World` -1.308e+15
## `SubsectionName.nb.fctrFashion & Style` NA
## `SubsectionName.nb.fctrTStyle::TStyle` NA
## `SubsectionName.nb.fctrAsia Pacific` NA
## `SubsectionName.nb.fctrBusiness::Technology` NA
## `SubsectionName.nb.fctrMagazine::Magazine` NA
## `SubsectionName.nb.fctrmyMultimedia::Multimedia` NA
## `SubsectionName.nb.fctrmyMisc::myMisc::` NA
## `SubsectionName.nb.fctrTravel::Travel` NA
## SubsectionName.nb.fctrEducation NA
## `SubsectionName.nb.fctrThe Public Editor` NA
## `SubsectionName.nb.fctrSmall Business` NA
## `SubsectionName.nb.fctrMetro::N.Y. / Region` NA
## `SubsectionName.nb.fctrDaily Clip Report::Daily Clip Report::` NA
## `SubsectionName.nb.fctrmyMisc::Open` NA
## `SubsectionName.nb.fctrTStyle::Technology` NA
## `SubsectionName.nb.fctrReaders Respond::Readers Respond::` NA
## SubsectionName.nb.fctrPolitics NA
## `SubsectionName.nb.fctrSports::Sports` NA
## `SubsectionName.nb.fctrNational::National` NA
## `SubsectionName.nb.fctrVerbatim::Verbatim::` NA
## `SubsectionName.nb.fctrFirst Draft::First Draft::` NA
## `SubsectionName.nb.fctrToday in Politics::Today in Politics::` NA
## `SubsectionName.nb.fctrReporter's Notebook::Reporter's Notebook::` NA
## `SubsectionName.nb.fctrCulture::Culture` NA
## `SubsectionName.nb.fctrmyMisc::U.S.` NA
## `SubsectionName.nb.fctrThe Daily Gift::The Daily Gift::` NA
## `SubsectionName.nb.fctrmyMisc::Travel` NA
## `SubsectionName.nb.fctrmyMultimedia::N.Y. / Region` NA
## A.num.chars.log -3.441e+15
## S.num.chars.log 3.234e+15
## A.num.words.log -3.558e+16
## S.num.words.log 3.687e+16
## A.num.words.unq.log 3.425e+16
## S.num.words.unq.log -3.578e+16
## Std. Error
## (Intercept) 2.411e+07
## WordCount.log 1.268e+06
## PubDate.hour 3.912e+05
## H.is.question 4.704e+06
## PubDate.apm.fctrpm 3.669e+06
## A.can 7.817e+07
## S.can 7.873e+07
## H.has.ebola 9.131e+06
## S.make 5.281e+06
## A.make NA
## S.one 1.075e+08
## S.state 5.581e+06
## A.state NA
## A.one 1.076e+08
## S.said 5.264e+06
## A.said NA
## .rnorm 1.009e+06
## `PubDate.date.fctr(7,13]` 3.148e+06
## `PubDate.date.fctr(13,19]` 3.118e+06
## `PubDate.date.fctr(19,25]` 3.034e+06
## `PubDate.date.fctr(25,31]` 3.382e+06
## PubDate.second 5.844e+04
## S.presid 5.117e+06
## A.presid NA
## S.take 9.747e+07
## A.take 9.732e+07
## PubDate.minute 5.846e+04
## S.new 5.272e+07
## A.new 5.260e+07
## PubDate.wkday.fctr1 5.336e+06
## PubDate.wkday.fctr2 5.349e+06
## PubDate.wkday.fctr3 5.335e+06
## PubDate.wkday.fctr4 5.327e+06
## PubDate.wkday.fctr5 5.359e+06
## PubDate.wkday.fctr6 7.925e+06
## S.day 7.461e+07
## A.day 7.437e+07
## H.X2014 1.001e+07
## S.show 5.411e+06
## A.show NA
## S.report 6.007e+06
## A.report NA
## S.share 6.358e+06
## A.share NA
## S.year 4.735e+06
## A.year NA
## S.compani 4.488e+06
## A.compani NA
## H.new 5.582e+06
## S.first 5.691e+06
## A.first NA
## S.time 7.015e+07
## A.time 6.999e+07
## H.newyork 7.686e+06
## S.articl 1.008e+07
## A.articl NA
## S.will 7.126e+07
## A.will 7.125e+07
## H.day 9.117e+06
## S.newyork 5.024e+06
## A.newyork NA
## H.today 2.764e+07
## H.report 1.026e+07
## S.intern 9.817e+06
## A.intern NA
## H.week 1.169e+07
## H.fashion 1.436e+07
## S.week 5.014e+06
## A.week NA
## S.fashion 7.063e+06
## A.fashion NA
## `Headline.pfx.fctr19[0-9][0-9]::` 1.773e+07
## `Headline.pfx.fctrDaily Report::` 1.605e+07
## `Headline.pfx.fctr.*Fashion Week::` 2.004e+07
## `Headline.pfx.fctrWhat We're::` 1.475e+07
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` 1.762e+07
## `Headline.pfx.fctrToday in Small Business::` 3.105e+07
## `Headline.pfx.fctrDaily Clip Report::` 2.053e+07
## `Headline.pfx.fctrMorning Agenda::` 1.147e+07
## `Headline.pfx.fctrNew York Today::` 3.105e+07
## `Headline.pfx.fctr6 Q's About the News::` 1.385e+07
## `Headline.pfx.fctrTest Yourself::` 1.520e+07
## `Headline.pfx.fctrWord of the Day::` 1.985e+07
## `Headline.pfx.fctrmyTech::` 8.834e+06
## `Headline.pfx.fctrYour Turn::` 2.483e+07
## `Headline.pfx.fctrReaders Respond::` 2.813e+07
## `Headline.pfx.fctrAsk Well::` 2.889e+07
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` 2.698e+07
## `Headline.pfx.fctrOn This Day::` 2.573e+07
## `Headline.pfx.fctrVerbatim::` 1.691e+07
## `Headline.pfx.fctrFirst Draft::` 1.376e+07
## `Headline.pfx.fctrToday in Politics::` 3.137e+07
## `Headline.pfx.fctrReporter's Notebook::` 3.041e+07
## `Headline.pfx.fctrmyFood::` 1.455e+07
## `Headline.pfx.fctrThe Daily Gift::` 1.830e+07
## SectionName.nb.fctrArts 8.897e+06
## `SectionName.nb.fctrBusiness Day` 1.246e+07
## SectionName.nb.fctrHealth 1.021e+07
## SectionName.nb.fctrOpinion 2.001e+07
## SectionName.nb.fctrWorld 1.072e+07
## SectionName.nb.fctrStyles 5.058e+07
## SectionName.nb.fctrTStyle 4.937e+07
## SectionName.nb.fctrTechnology 1.029e+07
## SectionName.nb.fctrMagazine 1.719e+07
## SectionName.nb.fctrMultimedia 1.203e+07
## `SectionName.nb.fctrmyMisc::` 8.659e+06
## SectionName.nb.fctrTravel 1.092e+07
## SectionName.nb.fctrU.S. 4.905e+07
## `SectionName.nb.fctrN.Y. / Region` 1.081e+07
## `SectionName.nb.fctrDaily Clip Report::` NA
## SectionName.nb.fctrOpen 4.856e+07
## `SectionName.nb.fctrReaders Respond::` 3.614e+07
## SectionName.nb.fctrSports 6.976e+07
## SectionName.nb.fctrNational 4.849e+07
## `SectionName.nb.fctrVerbatim::` NA
## `SectionName.nb.fctrFirst Draft::` NA
## `SectionName.nb.fctrToday in Politics::` NA
## `SectionName.nb.fctrReporter's Notebook::` NA
## SectionName.nb.fctrCulture NA
## `SectionName.nb.fctrThe Daily Gift::` NA
## H.num.chars.log 6.544e+06
## NewsDesk.nb.fctrCulture NA
## NewsDesk.nb.fctrScience NA
## NewsDesk.nb.fctrOpEd NA
## NewsDesk.nb.fctrForeign NA
## NewsDesk.nb.fctrStyles 4.911e+07
## NewsDesk.nb.fctrTStyle 4.851e+07
## NewsDesk.nb.fctrMagazine NA
## NewsDesk.nb.fctrmyMultimedia NA
## `NewsDesk.nb.fctrmyMisc::` NA
## NewsDesk.nb.fctrTravel NA
## NewsDesk.nb.fctrmyEducation 4.944e+07
## NewsDesk.nb.fctrMetro NA
## `NewsDesk.nb.fctrDaily Clip Report::` NA
## `NewsDesk.nb.fctrReaders Respond::` NA
## NewsDesk.nb.fctrNational NA
## NewsDesk.nb.fctrSports NA
## `NewsDesk.nb.fctrVerbatim::` NA
## `NewsDesk.nb.fctrFirst Draft::` NA
## `NewsDesk.nb.fctrToday in Politics::` NA
## `NewsDesk.nb.fctrReporter's Notebook::` NA
## `NewsDesk.nb.fctrThe Daily Gift::` NA
## H.num.words.log 3.800e+07
## H.num.words.unq.log 3.751e+07
## `SubsectionName.nb.fctrCulture::Arts` NA
## SubsectionName.nb.fctrDealbook 9.743e+06
## `SubsectionName.nb.fctrScience::Health` NA
## `SubsectionName.nb.fctrOpEd::Opinion` 1.860e+07
## `SubsectionName.nb.fctrRoom For Debate` 2.103e+07
## `SubsectionName.nb.fctrForeign::World` 1.897e+07
## `SubsectionName.nb.fctrFashion & Style` NA
## `SubsectionName.nb.fctrTStyle::TStyle` NA
## `SubsectionName.nb.fctrAsia Pacific` NA
## `SubsectionName.nb.fctrBusiness::Technology` NA
## `SubsectionName.nb.fctrMagazine::Magazine` NA
## `SubsectionName.nb.fctrmyMultimedia::Multimedia` NA
## `SubsectionName.nb.fctrmyMisc::myMisc::` NA
## `SubsectionName.nb.fctrTravel::Travel` NA
## SubsectionName.nb.fctrEducation NA
## `SubsectionName.nb.fctrThe Public Editor` NA
## `SubsectionName.nb.fctrSmall Business` NA
## `SubsectionName.nb.fctrMetro::N.Y. / Region` NA
## `SubsectionName.nb.fctrDaily Clip Report::Daily Clip Report::` NA
## `SubsectionName.nb.fctrmyMisc::Open` NA
## `SubsectionName.nb.fctrTStyle::Technology` NA
## `SubsectionName.nb.fctrReaders Respond::Readers Respond::` NA
## SubsectionName.nb.fctrPolitics NA
## `SubsectionName.nb.fctrSports::Sports` NA
## `SubsectionName.nb.fctrNational::National` NA
## `SubsectionName.nb.fctrVerbatim::Verbatim::` NA
## `SubsectionName.nb.fctrFirst Draft::First Draft::` NA
## `SubsectionName.nb.fctrToday in Politics::Today in Politics::` NA
## `SubsectionName.nb.fctrReporter's Notebook::Reporter's Notebook::` NA
## `SubsectionName.nb.fctrCulture::Culture` NA
## `SubsectionName.nb.fctrmyMisc::U.S.` NA
## `SubsectionName.nb.fctrThe Daily Gift::The Daily Gift::` NA
## `SubsectionName.nb.fctrmyMisc::Travel` NA
## `SubsectionName.nb.fctrmyMultimedia::N.Y. / Region` NA
## A.num.chars.log 1.135e+08
## S.num.chars.log 1.135e+08
## A.num.words.log 5.194e+08
## S.num.words.log 5.194e+08
## A.num.words.unq.log 5.232e+08
## S.num.words.unq.log 5.227e+08
## z value
## (Intercept) -108463114
## WordCount.log 530638744
## PubDate.hour 123877608
## H.is.question 96042631
## PubDate.apm.fctrpm -88410962
## A.can 84787786
## S.can -86872208
## H.has.ebola -42099411
## S.make -27114441
## A.make NA
## S.one -30045586
## S.state 15315896
## A.state NA
## A.one 30145374
## S.said 89218060
## A.said NA
## .rnorm 39684282
## `PubDate.date.fctr(7,13]` -28993199
## `PubDate.date.fctr(13,19]` -11592547
## `PubDate.date.fctr(19,25]` -35267958
## `PubDate.date.fctr(25,31]` -140457
## PubDate.second 12359909
## S.presid 9135840
## A.presid NA
## S.take -55108798
## A.take 55281262
## PubDate.minute -17083823
## S.new -1320386
## A.new 1398047
## PubDate.wkday.fctr1 -22244034
## PubDate.wkday.fctr2 -63753257
## PubDate.wkday.fctr3 -24403740
## PubDate.wkday.fctr4 -83300797
## PubDate.wkday.fctr5 -54722149
## PubDate.wkday.fctr6 29511245
## S.day -66149951
## A.day 67296163
## H.X2014 -4195610
## S.show -13584517
## A.show NA
## S.report 11034720
## A.report NA
## S.share -181752604
## A.share NA
## S.year -74294107
## A.year NA
## S.compani -86468102
## A.compani NA
## H.new -56342852
## S.first 23714955
## A.first NA
## S.time 69782395
## A.time -69090071
## H.newyork -57977541
## S.articl -19398127
## A.articl NA
## S.will 6414046
## A.will -6811109
## H.day -3896048
## S.newyork 14501880
## A.newyork NA
## H.today -81269863
## H.report -60111853
## S.intern 7217909
## A.intern NA
## H.week -16939209
## H.fashion 36204323
## S.week -17384037
## A.week NA
## S.fashion 35173997
## A.fashion NA
## `Headline.pfx.fctr19[0-9][0-9]::` 167545851
## `Headline.pfx.fctrDaily Report::` 4882184
## `Headline.pfx.fctr.*Fashion Week::` -98323858
## `Headline.pfx.fctrWhat We're::` -367679546
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` 13494476
## `Headline.pfx.fctrToday in Small Business::` -10395218
## `Headline.pfx.fctrDaily Clip Report::` -137929413
## `Headline.pfx.fctrMorning Agenda::` -304501634
## `Headline.pfx.fctrNew York Today::` 37839598
## `Headline.pfx.fctr6 Q's About the News::` 266586625
## `Headline.pfx.fctrTest Yourself::` 127621716
## `Headline.pfx.fctrWord of the Day::` 176205073
## `Headline.pfx.fctrmyTech::` 31918172
## `Headline.pfx.fctrYour Turn::` 12189481
## `Headline.pfx.fctrReaders Respond::` 26148336
## `Headline.pfx.fctrAsk Well::` -10777858
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` 200474621
## `Headline.pfx.fctrOn This Day::` 19572113
## `Headline.pfx.fctrVerbatim::` -157367910
## `Headline.pfx.fctrFirst Draft::` -222139177
## `Headline.pfx.fctrToday in Politics::` -6601672
## `Headline.pfx.fctrReporter's Notebook::` 11892863
## `Headline.pfx.fctrmyFood::` -34185924
## `Headline.pfx.fctrThe Daily Gift::` -35453381
## SectionName.nb.fctrArts -115128770
## `SectionName.nb.fctrBusiness Day` -93077898
## SectionName.nb.fctrHealth 74199586
## SectionName.nb.fctrOpinion 113482589
## SectionName.nb.fctrWorld -240451661
## SectionName.nb.fctrStyles -154924121
## SectionName.nb.fctrTStyle -27653037
## SectionName.nb.fctrTechnology -3248331
## SectionName.nb.fctrMagazine -225397623
## SectionName.nb.fctrMultimedia -194363747
## `SectionName.nb.fctrmyMisc::` -62472380
## SectionName.nb.fctrTravel -144423501
## SectionName.nb.fctrU.S. -92647181
## `SectionName.nb.fctrN.Y. / Region` -32413813
## `SectionName.nb.fctrDaily Clip Report::` NA
## SectionName.nb.fctrOpen -93926488
## `SectionName.nb.fctrReaders Respond::` 14321487
## SectionName.nb.fctrSports -61135923
## SectionName.nb.fctrNational -77508359
## `SectionName.nb.fctrVerbatim::` NA
## `SectionName.nb.fctrFirst Draft::` NA
## `SectionName.nb.fctrToday in Politics::` NA
## `SectionName.nb.fctrReporter's Notebook::` NA
## SectionName.nb.fctrCulture NA
## `SectionName.nb.fctrThe Daily Gift::` NA
## H.num.chars.log 6256020
## NewsDesk.nb.fctrCulture NA
## NewsDesk.nb.fctrScience NA
## NewsDesk.nb.fctrOpEd NA
## NewsDesk.nb.fctrForeign NA
## NewsDesk.nb.fctrStyles 118507248
## NewsDesk.nb.fctrTStyle -1783805
## NewsDesk.nb.fctrMagazine NA
## NewsDesk.nb.fctrmyMultimedia NA
## `NewsDesk.nb.fctrmyMisc::` NA
## NewsDesk.nb.fctrTravel NA
## NewsDesk.nb.fctrmyEducation 8574209
## NewsDesk.nb.fctrMetro NA
## `NewsDesk.nb.fctrDaily Clip Report::` NA
## `NewsDesk.nb.fctrReaders Respond::` NA
## NewsDesk.nb.fctrNational NA
## NewsDesk.nb.fctrSports NA
## `NewsDesk.nb.fctrVerbatim::` NA
## `NewsDesk.nb.fctrFirst Draft::` NA
## `NewsDesk.nb.fctrToday in Politics::` NA
## `NewsDesk.nb.fctrReporter's Notebook::` NA
## `NewsDesk.nb.fctrThe Daily Gift::` NA
## H.num.words.log 1939940
## H.num.words.unq.log -11486586
## `SubsectionName.nb.fctrCulture::Arts` NA
## SubsectionName.nb.fctrDealbook 83225252
## `SubsectionName.nb.fctrScience::Health` NA
## `SubsectionName.nb.fctrOpEd::Opinion` -73358404
## `SubsectionName.nb.fctrRoom For Debate` -235643377
## `SubsectionName.nb.fctrForeign::World` -68956040
## `SubsectionName.nb.fctrFashion & Style` NA
## `SubsectionName.nb.fctrTStyle::TStyle` NA
## `SubsectionName.nb.fctrAsia Pacific` NA
## `SubsectionName.nb.fctrBusiness::Technology` NA
## `SubsectionName.nb.fctrMagazine::Magazine` NA
## `SubsectionName.nb.fctrmyMultimedia::Multimedia` NA
## `SubsectionName.nb.fctrmyMisc::myMisc::` NA
## `SubsectionName.nb.fctrTravel::Travel` NA
## SubsectionName.nb.fctrEducation NA
## `SubsectionName.nb.fctrThe Public Editor` NA
## `SubsectionName.nb.fctrSmall Business` NA
## `SubsectionName.nb.fctrMetro::N.Y. / Region` NA
## `SubsectionName.nb.fctrDaily Clip Report::Daily Clip Report::` NA
## `SubsectionName.nb.fctrmyMisc::Open` NA
## `SubsectionName.nb.fctrTStyle::Technology` NA
## `SubsectionName.nb.fctrReaders Respond::Readers Respond::` NA
## SubsectionName.nb.fctrPolitics NA
## `SubsectionName.nb.fctrSports::Sports` NA
## `SubsectionName.nb.fctrNational::National` NA
## `SubsectionName.nb.fctrVerbatim::Verbatim::` NA
## `SubsectionName.nb.fctrFirst Draft::First Draft::` NA
## `SubsectionName.nb.fctrToday in Politics::Today in Politics::` NA
## `SubsectionName.nb.fctrReporter's Notebook::Reporter's Notebook::` NA
## `SubsectionName.nb.fctrCulture::Culture` NA
## `SubsectionName.nb.fctrmyMisc::U.S.` NA
## `SubsectionName.nb.fctrThe Daily Gift::The Daily Gift::` NA
## `SubsectionName.nb.fctrmyMisc::Travel` NA
## `SubsectionName.nb.fctrmyMultimedia::N.Y. / Region` NA
## A.num.chars.log -30316691
## S.num.chars.log 28495933
## A.num.words.log -68501832
## S.num.words.log 70973775
## A.num.words.unq.log 65453618
## S.num.words.unq.log -68452469
## Pr(>|z|)
## (Intercept) <2e-16
## WordCount.log <2e-16
## PubDate.hour <2e-16
## H.is.question <2e-16
## PubDate.apm.fctrpm <2e-16
## A.can <2e-16
## S.can <2e-16
## H.has.ebola <2e-16
## S.make <2e-16
## A.make NA
## S.one <2e-16
## S.state <2e-16
## A.state NA
## A.one <2e-16
## S.said <2e-16
## A.said NA
## .rnorm <2e-16
## `PubDate.date.fctr(7,13]` <2e-16
## `PubDate.date.fctr(13,19]` <2e-16
## `PubDate.date.fctr(19,25]` <2e-16
## `PubDate.date.fctr(25,31]` <2e-16
## PubDate.second <2e-16
## S.presid <2e-16
## A.presid NA
## S.take <2e-16
## A.take <2e-16
## PubDate.minute <2e-16
## S.new <2e-16
## A.new <2e-16
## PubDate.wkday.fctr1 <2e-16
## PubDate.wkday.fctr2 <2e-16
## PubDate.wkday.fctr3 <2e-16
## PubDate.wkday.fctr4 <2e-16
## PubDate.wkday.fctr5 <2e-16
## PubDate.wkday.fctr6 <2e-16
## S.day <2e-16
## A.day <2e-16
## H.X2014 <2e-16
## S.show <2e-16
## A.show NA
## S.report <2e-16
## A.report NA
## S.share <2e-16
## A.share NA
## S.year <2e-16
## A.year NA
## S.compani <2e-16
## A.compani NA
## H.new <2e-16
## S.first <2e-16
## A.first NA
## S.time <2e-16
## A.time <2e-16
## H.newyork <2e-16
## S.articl <2e-16
## A.articl NA
## S.will <2e-16
## A.will <2e-16
## H.day <2e-16
## S.newyork <2e-16
## A.newyork NA
## H.today <2e-16
## H.report <2e-16
## S.intern <2e-16
## A.intern NA
## H.week <2e-16
## H.fashion <2e-16
## S.week <2e-16
## A.week NA
## S.fashion <2e-16
## A.fashion NA
## `Headline.pfx.fctr19[0-9][0-9]::` <2e-16
## `Headline.pfx.fctrDaily Report::` <2e-16
## `Headline.pfx.fctr.*Fashion Week::` <2e-16
## `Headline.pfx.fctrWhat We're::` <2e-16
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` <2e-16
## `Headline.pfx.fctrToday in Small Business::` <2e-16
## `Headline.pfx.fctrDaily Clip Report::` <2e-16
## `Headline.pfx.fctrMorning Agenda::` <2e-16
## `Headline.pfx.fctrNew York Today::` <2e-16
## `Headline.pfx.fctr6 Q's About the News::` <2e-16
## `Headline.pfx.fctrTest Yourself::` <2e-16
## `Headline.pfx.fctrWord of the Day::` <2e-16
## `Headline.pfx.fctrmyTech::` <2e-16
## `Headline.pfx.fctrYour Turn::` <2e-16
## `Headline.pfx.fctrReaders Respond::` <2e-16
## `Headline.pfx.fctrAsk Well::` <2e-16
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` <2e-16
## `Headline.pfx.fctrOn This Day::` <2e-16
## `Headline.pfx.fctrVerbatim::` <2e-16
## `Headline.pfx.fctrFirst Draft::` <2e-16
## `Headline.pfx.fctrToday in Politics::` <2e-16
## `Headline.pfx.fctrReporter's Notebook::` <2e-16
## `Headline.pfx.fctrmyFood::` <2e-16
## `Headline.pfx.fctrThe Daily Gift::` <2e-16
## SectionName.nb.fctrArts <2e-16
## `SectionName.nb.fctrBusiness Day` <2e-16
## SectionName.nb.fctrHealth <2e-16
## SectionName.nb.fctrOpinion <2e-16
## SectionName.nb.fctrWorld <2e-16
## SectionName.nb.fctrStyles <2e-16
## SectionName.nb.fctrTStyle <2e-16
## SectionName.nb.fctrTechnology <2e-16
## SectionName.nb.fctrMagazine <2e-16
## SectionName.nb.fctrMultimedia <2e-16
## `SectionName.nb.fctrmyMisc::` <2e-16
## SectionName.nb.fctrTravel <2e-16
## SectionName.nb.fctrU.S. <2e-16
## `SectionName.nb.fctrN.Y. / Region` <2e-16
## `SectionName.nb.fctrDaily Clip Report::` NA
## SectionName.nb.fctrOpen <2e-16
## `SectionName.nb.fctrReaders Respond::` <2e-16
## SectionName.nb.fctrSports <2e-16
## SectionName.nb.fctrNational <2e-16
## `SectionName.nb.fctrVerbatim::` NA
## `SectionName.nb.fctrFirst Draft::` NA
## `SectionName.nb.fctrToday in Politics::` NA
## `SectionName.nb.fctrReporter's Notebook::` NA
## SectionName.nb.fctrCulture NA
## `SectionName.nb.fctrThe Daily Gift::` NA
## H.num.chars.log <2e-16
## NewsDesk.nb.fctrCulture NA
## NewsDesk.nb.fctrScience NA
## NewsDesk.nb.fctrOpEd NA
## NewsDesk.nb.fctrForeign NA
## NewsDesk.nb.fctrStyles <2e-16
## NewsDesk.nb.fctrTStyle <2e-16
## NewsDesk.nb.fctrMagazine NA
## NewsDesk.nb.fctrmyMultimedia NA
## `NewsDesk.nb.fctrmyMisc::` NA
## NewsDesk.nb.fctrTravel NA
## NewsDesk.nb.fctrmyEducation <2e-16
## NewsDesk.nb.fctrMetro NA
## `NewsDesk.nb.fctrDaily Clip Report::` NA
## `NewsDesk.nb.fctrReaders Respond::` NA
## NewsDesk.nb.fctrNational NA
## NewsDesk.nb.fctrSports NA
## `NewsDesk.nb.fctrVerbatim::` NA
## `NewsDesk.nb.fctrFirst Draft::` NA
## `NewsDesk.nb.fctrToday in Politics::` NA
## `NewsDesk.nb.fctrReporter's Notebook::` NA
## `NewsDesk.nb.fctrThe Daily Gift::` NA
## H.num.words.log <2e-16
## H.num.words.unq.log <2e-16
## `SubsectionName.nb.fctrCulture::Arts` NA
## SubsectionName.nb.fctrDealbook <2e-16
## `SubsectionName.nb.fctrScience::Health` NA
## `SubsectionName.nb.fctrOpEd::Opinion` <2e-16
## `SubsectionName.nb.fctrRoom For Debate` <2e-16
## `SubsectionName.nb.fctrForeign::World` <2e-16
## `SubsectionName.nb.fctrFashion & Style` NA
## `SubsectionName.nb.fctrTStyle::TStyle` NA
## `SubsectionName.nb.fctrAsia Pacific` NA
## `SubsectionName.nb.fctrBusiness::Technology` NA
## `SubsectionName.nb.fctrMagazine::Magazine` NA
## `SubsectionName.nb.fctrmyMultimedia::Multimedia` NA
## `SubsectionName.nb.fctrmyMisc::myMisc::` NA
## `SubsectionName.nb.fctrTravel::Travel` NA
## SubsectionName.nb.fctrEducation NA
## `SubsectionName.nb.fctrThe Public Editor` NA
## `SubsectionName.nb.fctrSmall Business` NA
## `SubsectionName.nb.fctrMetro::N.Y. / Region` NA
## `SubsectionName.nb.fctrDaily Clip Report::Daily Clip Report::` NA
## `SubsectionName.nb.fctrmyMisc::Open` NA
## `SubsectionName.nb.fctrTStyle::Technology` NA
## `SubsectionName.nb.fctrReaders Respond::Readers Respond::` NA
## SubsectionName.nb.fctrPolitics NA
## `SubsectionName.nb.fctrSports::Sports` NA
## `SubsectionName.nb.fctrNational::National` NA
## `SubsectionName.nb.fctrVerbatim::Verbatim::` NA
## `SubsectionName.nb.fctrFirst Draft::First Draft::` NA
## `SubsectionName.nb.fctrToday in Politics::Today in Politics::` NA
## `SubsectionName.nb.fctrReporter's Notebook::Reporter's Notebook::` NA
## `SubsectionName.nb.fctrCulture::Culture` NA
## `SubsectionName.nb.fctrmyMisc::U.S.` NA
## `SubsectionName.nb.fctrThe Daily Gift::The Daily Gift::` NA
## `SubsectionName.nb.fctrmyMisc::Travel` NA
## `SubsectionName.nb.fctrmyMultimedia::N.Y. / Region` NA
## A.num.chars.log <2e-16
## S.num.chars.log <2e-16
## A.num.words.log <2e-16
## S.num.words.log <2e-16
## A.num.words.unq.log <2e-16
## S.num.words.unq.log <2e-16
##
## (Intercept) ***
## WordCount.log ***
## PubDate.hour ***
## H.is.question ***
## PubDate.apm.fctrpm ***
## A.can ***
## S.can ***
## H.has.ebola ***
## S.make ***
## A.make
## S.one ***
## S.state ***
## A.state
## A.one ***
## S.said ***
## A.said
## .rnorm ***
## `PubDate.date.fctr(7,13]` ***
## `PubDate.date.fctr(13,19]` ***
## `PubDate.date.fctr(19,25]` ***
## `PubDate.date.fctr(25,31]` ***
## PubDate.second ***
## S.presid ***
## A.presid
## S.take ***
## A.take ***
## PubDate.minute ***
## S.new ***
## A.new ***
## PubDate.wkday.fctr1 ***
## PubDate.wkday.fctr2 ***
## PubDate.wkday.fctr3 ***
## PubDate.wkday.fctr4 ***
## PubDate.wkday.fctr5 ***
## PubDate.wkday.fctr6 ***
## S.day ***
## A.day ***
## H.X2014 ***
## S.show ***
## A.show
## S.report ***
## A.report
## S.share ***
## A.share
## S.year ***
## A.year
## S.compani ***
## A.compani
## H.new ***
## S.first ***
## A.first
## S.time ***
## A.time ***
## H.newyork ***
## S.articl ***
## A.articl
## S.will ***
## A.will ***
## H.day ***
## S.newyork ***
## A.newyork
## H.today ***
## H.report ***
## S.intern ***
## A.intern
## H.week ***
## H.fashion ***
## S.week ***
## A.week
## S.fashion ***
## A.fashion
## `Headline.pfx.fctr19[0-9][0-9]::` ***
## `Headline.pfx.fctrDaily Report::` ***
## `Headline.pfx.fctr.*Fashion Week::` ***
## `Headline.pfx.fctrWhat We're::` ***
## `Headline.pfx.fctrPictures of the (Day|Year|.)::` ***
## `Headline.pfx.fctrToday in Small Business::` ***
## `Headline.pfx.fctrDaily Clip Report::` ***
## `Headline.pfx.fctrMorning Agenda::` ***
## `Headline.pfx.fctrNew York Today::` ***
## `Headline.pfx.fctr6 Q's About the News::` ***
## `Headline.pfx.fctrTest Yourself::` ***
## `Headline.pfx.fctrWord of the Day::` ***
## `Headline.pfx.fctrmyTech::` ***
## `Headline.pfx.fctrYour Turn::` ***
## `Headline.pfx.fctrReaders Respond::` ***
## `Headline.pfx.fctrAsk Well::` ***
## `Headline.pfx.fctrQuiz(.*)([?=|]|[?=:]::` ***
## `Headline.pfx.fctrOn This Day::` ***
## `Headline.pfx.fctrVerbatim::` ***
## `Headline.pfx.fctrFirst Draft::` ***
## `Headline.pfx.fctrToday in Politics::` ***
## `Headline.pfx.fctrReporter's Notebook::` ***
## `Headline.pfx.fctrmyFood::` ***
## `Headline.pfx.fctrThe Daily Gift::` ***
## SectionName.nb.fctrArts ***
## `SectionName.nb.fctrBusiness Day` ***
## SectionName.nb.fctrHealth ***
## SectionName.nb.fctrOpinion ***
## SectionName.nb.fctrWorld ***
## SectionName.nb.fctrStyles ***
## SectionName.nb.fctrTStyle ***
## SectionName.nb.fctrTechnology ***
## SectionName.nb.fctrMagazine ***
## SectionName.nb.fctrMultimedia ***
## `SectionName.nb.fctrmyMisc::` ***
## SectionName.nb.fctrTravel ***
## SectionName.nb.fctrU.S. ***
## `SectionName.nb.fctrN.Y. / Region` ***
## `SectionName.nb.fctrDaily Clip Report::`
## SectionName.nb.fctrOpen ***
## `SectionName.nb.fctrReaders Respond::` ***
## SectionName.nb.fctrSports ***
## SectionName.nb.fctrNational ***
## `SectionName.nb.fctrVerbatim::`
## `SectionName.nb.fctrFirst Draft::`
## `SectionName.nb.fctrToday in Politics::`
## `SectionName.nb.fctrReporter's Notebook::`
## SectionName.nb.fctrCulture
## `SectionName.nb.fctrThe Daily Gift::`
## H.num.chars.log ***
## NewsDesk.nb.fctrCulture
## NewsDesk.nb.fctrScience
## NewsDesk.nb.fctrOpEd
## NewsDesk.nb.fctrForeign
## NewsDesk.nb.fctrStyles ***
## NewsDesk.nb.fctrTStyle ***
## NewsDesk.nb.fctrMagazine
## NewsDesk.nb.fctrmyMultimedia
## `NewsDesk.nb.fctrmyMisc::`
## NewsDesk.nb.fctrTravel
## NewsDesk.nb.fctrmyEducation ***
## NewsDesk.nb.fctrMetro
## `NewsDesk.nb.fctrDaily Clip Report::`
## `NewsDesk.nb.fctrReaders Respond::`
## NewsDesk.nb.fctrNational
## NewsDesk.nb.fctrSports
## `NewsDesk.nb.fctrVerbatim::`
## `NewsDesk.nb.fctrFirst Draft::`
## `NewsDesk.nb.fctrToday in Politics::`
## `NewsDesk.nb.fctrReporter's Notebook::`
## `NewsDesk.nb.fctrThe Daily Gift::`
## H.num.words.log ***
## H.num.words.unq.log ***
## `SubsectionName.nb.fctrCulture::Arts`
## SubsectionName.nb.fctrDealbook ***
## `SubsectionName.nb.fctrScience::Health`
## `SubsectionName.nb.fctrOpEd::Opinion` ***
## `SubsectionName.nb.fctrRoom For Debate` ***
## `SubsectionName.nb.fctrForeign::World` ***
## `SubsectionName.nb.fctrFashion & Style`
## `SubsectionName.nb.fctrTStyle::TStyle`
## `SubsectionName.nb.fctrAsia Pacific`
## `SubsectionName.nb.fctrBusiness::Technology`
## `SubsectionName.nb.fctrMagazine::Magazine`
## `SubsectionName.nb.fctrmyMultimedia::Multimedia`
## `SubsectionName.nb.fctrmyMisc::myMisc::`
## `SubsectionName.nb.fctrTravel::Travel`
## SubsectionName.nb.fctrEducation
## `SubsectionName.nb.fctrThe Public Editor`
## `SubsectionName.nb.fctrSmall Business`
## `SubsectionName.nb.fctrMetro::N.Y. / Region`
## `SubsectionName.nb.fctrDaily Clip Report::Daily Clip Report::`
## `SubsectionName.nb.fctrmyMisc::Open`
## `SubsectionName.nb.fctrTStyle::Technology`
## `SubsectionName.nb.fctrReaders Respond::Readers Respond::`
## SubsectionName.nb.fctrPolitics
## `SubsectionName.nb.fctrSports::Sports`
## `SubsectionName.nb.fctrNational::National`
## `SubsectionName.nb.fctrVerbatim::Verbatim::`
## `SubsectionName.nb.fctrFirst Draft::First Draft::`
## `SubsectionName.nb.fctrToday in Politics::Today in Politics::`
## `SubsectionName.nb.fctrReporter's Notebook::Reporter's Notebook::`
## `SubsectionName.nb.fctrCulture::Culture`
## `SubsectionName.nb.fctrmyMisc::U.S.`
## `SubsectionName.nb.fctrThe Daily Gift::The Daily Gift::`
## `SubsectionName.nb.fctrmyMisc::Travel`
## `SubsectionName.nb.fctrmyMultimedia::N.Y. / Region`
## A.num.chars.log ***
## S.num.chars.log ***
## A.num.words.log ***
## S.num.words.log ***
## A.num.words.unq.log ***
## S.num.words.unq.log ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4042.7 on 4474 degrees of freedom
## Residual deviance: 29700.0 on 4361 degrees of freedom
## AIC: 29928
##
## Number of Fisher Scoring iterations: 25
##
## [1] " calling mypredict_mdl for fit:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.7146814
## 3 0.2 0.7146814
## 4 0.3 0.7146814
## 5 0.4 0.7146814
## 6 0.5 0.7146814
## 7 0.6 0.7146814
## 8 0.7 0.7146814
## 9 0.8 0.7146814
## 10 0.9 0.7146814
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.9000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Conditional.X.glm.N
## 1 N 3547
## 2 Y 233
## Popular.fctr.predict.Conditional.X.glm.Y
## 1 179
## 2 516
## Prediction
## Reference N Y
## N 3547 179
## Y 233 516
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.079330e-01 6.598835e-01 8.990819e-01 9.162474e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 6.051667e-48 9.024554e-03
## [1] " calling mypredict_mdl for OOB:"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.7032641
## 3 0.2 0.7032641
## 4 0.3 0.7032641
## 5 0.4 0.7032641
## 6 0.5 0.7032641
## 7 0.6 0.7032641
## 8 0.7 0.7032641
## 9 0.8 0.7032641
## 10 0.9 0.7032641
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.9000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Conditional.X.glm.N
## 1 N 1620
## 2 Y 107
## Popular.fctr.predict.Conditional.X.glm.Y
## 1 93
## 2 237
## Prediction
## Reference N Y
## N 1620 93
## Y 107 237
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.027710e-01 6.451546e-01 8.891448e-01 9.152364e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 6.489691e-20 3.579707e-01
## model_id model_method
## 1 Conditional.X.glm glm
## feats
## 1 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 25.178 7.683
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.8204389 0.9 0.7146814 0.7562325
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8990819 0.9162474 0.3982561 0.8173314
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.9 0.7032641 0.902771
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.8891448 0.9152364 0.6451546 29927.97
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.2387515 0.3574867
## [1] "fitting model: Conditional.X.no.rnorm.rpart"
## [1] " indep_vars: WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log"
## + Fold1: cp=0.03738
## - Fold1: cp=0.03738
## + Fold2: cp=0.03738
## - Fold2: cp=0.03738
## + Fold3: cp=0.03738
## - Fold3: cp=0.03738
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0394 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4475
##
## CP nsplit rel error
## 1 0.22162884 0 1.0000000
## 2 0.03938585 1 0.7783712
##
## Variable importance
## SubsectionName.nb.fctrOpEd::Opinion NewsDesk.nb.fctrOpEd
## 33 28
## SectionName.nb.fctrOpinion A.num.chars.log
## 28 4
## S.num.chars.log A.num.words.unq.log
## 4 4
##
## Node number 1: 4475 observations, complexity param=0.2216288
## predicted class=N expected loss=0.1673743 P(node) =1
## class counts: 3726 749
## probabilities: 0.833 0.167
## left son=2 (4101 obs) right son=3 (374 obs)
## Primary splits:
## SubsectionName.nb.fctrOpEd::Opinion < 0.5 to the left, improve=251.00800, (0 missing)
## SectionName.nb.fctrOpinion < 0.5 to the left, improve=223.18070, (0 missing)
## NewsDesk.nb.fctrOpEd < 0.5 to the left, improve=223.18070, (0 missing)
## WordCount.log < 6.528688 to the left, improve=109.59970, (0 missing)
## A.num.chars.log < 3.795426 to the right, improve= 95.79591, (0 missing)
## Surrogate splits:
## SectionName.nb.fctrOpinion < 0.5 to the left, agree=0.987, adj=0.845, (0 split)
## NewsDesk.nb.fctrOpEd < 0.5 to the left, agree=0.987, adj=0.845, (0 split)
## A.num.chars.log < 3.725621 to the right, agree=0.927, adj=0.123, (0 split)
## S.num.chars.log < 3.725621 to the right, agree=0.927, adj=0.123, (0 split)
## A.num.words.unq.log < 1.497866 to the right, agree=0.926, adj=0.115, (0 split)
##
## Node number 2: 4101 observations
## predicted class=N expected loss=0.1168008 P(node) =0.9164246
## class counts: 3622 479
## probabilities: 0.883 0.117
##
## Node number 3: 374 observations
## predicted class=Y expected loss=0.2780749 P(node) =0.08357542
## class counts: 104 270
## probabilities: 0.278 0.722
##
## n= 4475
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4475 749 N (0.8326257 0.1673743)
## 2) SubsectionName.nb.fctrOpEd::Opinion< 0.5 4101 479 N (0.8831992 0.1168008) *
## 3) SubsectionName.nb.fctrOpEd::Opinion>=0.5 374 104 Y (0.2780749 0.7219251) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.2867534
## 2 0.1 0.2867534
## 3 0.2 0.4808549
## 4 0.3 0.4808549
## 5 0.4 0.4808549
## 6 0.5 0.4808549
## 7 0.6 0.4808549
## 8 0.7 0.4808549
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.7000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rpart.N
## 1 N 3622
## 2 Y 479
## Popular.fctr.predict.Conditional.X.no.rnorm.rpart.Y
## 1 104
## 2 270
## Prediction
## Reference N Y
## N 3622 104
## Y 479 270
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.697207e-01 4.157169e-01 8.595050e-01 8.794511e-01 8.326257e-01
## AccuracyPValue McnemarPValue
## 3.957808e-12 4.084715e-54
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.2865473
## 3 0.2 0.5176909
## 4 0.3 0.5176909
## 5 0.4 0.5176909
## 6 0.5 0.5176909
## 7 0.6 0.5176909
## 8 0.7 0.5176909
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.7000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rpart.N
## 1 N 1659
## 2 Y 205
## Popular.fctr.predict.Conditional.X.no.rnorm.rpart.Y
## 1 54
## 2 139
## Prediction
## Reference N Y
## N 1659 54
## Y 205 139
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.740885e-01 4.517912e-01 8.589743e-01 8.881272e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 1.169537e-07 1.157426e-20
## model_id model_method
## 1 Conditional.X.no.rnorm.rpart rpart
## feats
## 1 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 10.32 1.823
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.6662843 0.7 0.4808549 0.8744112
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.859505 0.8794511 0.4405605 0.6862731
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.7 0.5176909 0.8740885
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8589743 0.8881272 0.4517912
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01425467 0.07384068
## [1] "fitting model: Conditional.X.no.rnorm.rf"
## [1] " indep_vars: WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log"
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## + : mtry= 2
## - : mtry= 2
## + : mtry= 92
## - : mtry= 92
## + : mtry=182
## - : mtry=182
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 92 on full training set
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 4475 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 8950 matrix numeric
## oob.times 4475 -none- numeric
## classes 2 -none- character
## importance 182 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 4475 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 182 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.28675345
## 2 0.1 0.79723257
## 3 0.2 0.92184615
## 4 0.3 0.97780679
## 5 0.4 1.00000000
## 6 0.5 1.00000000
## 7 0.6 1.00000000
## 8 0.7 0.93758865
## 9 0.8 0.81674566
## 10 0.9 0.63818182
## 11 1.0 0.01589404
## [1] "Classifier Probability Threshold: 0.6000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.N
## 1 N 3726
## 2 Y NA
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.Y
## 1 NA
## 2 749
## Prediction
## Reference N Y
## N 3726 0
## Y 0 749
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.0000000 1.0000000 0.9991760 1.0000000 0.8326257
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.2865473
## 2 0.1 0.5945946
## 3 0.2 0.6718750
## 4 0.3 0.7192755
## 5 0.4 0.7311522
## 6 0.5 0.7001570
## 7 0.6 0.6768190
## 8 0.7 0.6281588
## 9 0.8 0.5091650
## 10 0.9 0.3213429
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.OOB"
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.N
## 1 N 1611
## 2 Y 87
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.Y
## 1 102
## 2 257
## Prediction
## Reference N Y
## N 1611 102
## Y 87 257
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.081186e-01 6.757737e-01 8.948040e-01 9.202563e-01 8.327662e-01
## AccuracyPValue McnemarPValue
## 4.677752e-23 3.085116e-01
## model_id model_method
## 1 Conditional.X.no.rnorm.rf rf
## feats
## 1 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 296.353 74.869
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 1 0.6 1 0.9099441
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.999176 1 0.6561726 0.9314323
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.4 0.7311522 0.9081186
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.894804 0.9202563 0.6757737
# --- Template / scratch area: alternate model specifications (all inactive) ---
# These commented-out snippets are kept as recipes for manually specifying
# models outside the automated pipeline above; uncomment and fill in the
# <placeholders> to use them.
# User specified
# easier to exclude features
#model_id_pfx <- "";
# indep_vars_vctr <- setdiff(names(glb_fitent_df),
# union(union(glb_rsp_var, glb_exclude_vars_as_features),
# c("<feat1_name>", "<feat2_name>")))
# method <- ""
# easier to include features
#model_id_pfx <- ""; indep_vars_vctr <- c("<feat1_name>", "<feat2_name>"); method <- ""
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_fitent_df),
# union(glb_rsp_var, glb_exclude_vars_as_features)))
# indep_vars_vctr_lst[["feat"]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(model_id=paste0(model_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitent_df, OOB_df=glb_OOBent_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glb_model_metric_terms,
# model_summaryFunction=glb_model_metric_smmry,
# model_metric=glb_model_metric,
# model_metric_maximize=glb_model_metric_maximize)
# Simplify a model
# fit_df <- glb_fitent_df; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glb_fitent_df, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glb_model_metric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Display the accumulated model-comparison table (one row per model fit above,
# with fit/OOB accuracy, AUC, Kappa, AIC, timings, etc.)
print(glb_models_df)
## model_id model_method
## 1 MFO.myMFO_classfr myMFO_classfr
## 2 Random.myrandom_classfr myrandom_classfr
## 3 Max.cor.Y.cv.0.rpart rpart
## 4 Max.cor.Y.cv.0.cp.0.rpart rpart
## 5 Max.cor.Y.rpart rpart
## 6 Max.cor.Y.glm glm
## 7 Interact.High.cor.Y.glm glm
## 8 Low.cor.X.glm glm
## 9 Conditional.X.glm glm
## 10 Conditional.X.no.rnorm.rpart rpart
## 11 Conditional.X.no.rnorm.rf rf
## feats
## 1 .rnorm
## 2 .rnorm
## 3 WordCount.log
## 4 WordCount.log
## 5 WordCount.log
## 6 WordCount.log
## 7 WordCount.log, WordCount.log:PubDate.apm.fctr, WordCount.log:S.can, WordCount.log:S.make, WordCount.log:S.presid, WordCount.log:S.take, WordCount.log:S.new, WordCount.log:S.day, WordCount.log:S.show, WordCount.log:S.report, WordCount.log:S.share, WordCount.log:S.year, WordCount.log:S.compani, WordCount.log:S.first, WordCount.log:S.time, WordCount.log:S.articl, WordCount.log:S.will, WordCount.log:S.newyork, WordCount.log:S.intern, WordCount.log:H.week, WordCount.log:S.week, WordCount.log:S.fashion, WordCount.log:SectionName.nb.fctr, WordCount.log:H.num.chars.log, WordCount.log:NewsDesk.nb.fctr, WordCount.log:A.num.chars.log, WordCount.log:A.num.words.log, WordCount.log:S.num.chars.log
## 8 WordCount.log, H.is.question, PubDate.apm.fctr, S.can, H.has.ebola, S.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, S.take, PubDate.minute, S.new, PubDate.wkday.fctr, S.day, H.X2014, S.show, S.report, S.share, S.year, S.compani, H.new, S.first, S.time, H.newyork, S.articl, S.will, H.day, S.newyork, H.today, H.report, S.intern, H.week, S.week, S.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, H.num.words.log, A.num.chars.log, A.num.words.log, A.num.words.unq.log
## 9 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## 10 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## 11 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.694 0.003
## 2 0 0.322 0.002
## 3 0 0.703 0.066
## 4 0 0.591 0.058
## 5 3 1.288 0.066
## 6 1 1.161 0.075
## 7 1 6.445 1.862
## 8 1 8.294 2.392
## 9 1 25.178 7.683
## 10 3 10.320 1.823
## 11 3 296.353 74.869
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5000000 0.5 0.0000000 0.8326257
## 2 0.4975446 0.1 0.2867534 0.1673743
## 3 0.5000000 0.5 0.0000000 0.8326257
## 4 0.7074274 0.2 0.4572127 0.8015642
## 5 0.5000000 0.5 0.0000000 0.8174308
## 6 0.7301738 0.2 0.4222222 0.8306149
## 7 0.9117549 0.3 0.6837945 0.7265090
## 8 0.9282722 0.2 0.6859688 0.8867060
## 9 0.8204389 0.9 0.7146814 0.7562325
## 10 0.6662843 0.7 0.4808549 0.8744112
## 11 1.0000000 0.6 1.0000000 0.9099441
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.8213602 0.8434553 0.00000000 0.5000000
## 2 0.1565447 0.1786398 0.00000000 0.4821958
## 3 0.8213602 0.8434553 0.00000000 0.5000000
## 4 0.7895723 0.8131613 0.33685715 0.6504263
## 5 0.8213602 0.8434553 0.06210715 0.5000000
## 6 0.6959502 0.7227703 0.01169498 0.7342331
## 7 0.8833016 0.9016572 0.29915157 0.9127805
## 8 0.8638863 0.8835558 0.55652112 0.9144538
## 9 0.8990819 0.9162474 0.39825605 0.8173314
## 10 0.8595050 0.8794511 0.44056055 0.6862731
## 11 0.9991760 1.0000000 0.65617265 0.9314323
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.0000000 0.8327662
## 2 0.1 0.2865473 0.1672338
## 3 0.5 0.0000000 0.8327662
## 4 0.2 0.3799472 0.7715119
## 5 0.5 0.0000000 0.8327662
## 6 0.2 0.3961722 0.6932426
## 7 0.3 0.6915352 0.8954789
## 8 0.3 0.6675603 0.8794361
## 9 0.9 0.7032641 0.9027710
## 10 0.7 0.5176909 0.8740885
## 11 0.4 0.7311522 0.9081186
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.8159247 0.8486533 0.0000000
## 2 0.1513467 0.1840753 0.0000000
## 3 0.8159247 0.8486533 0.0000000
## 4 0.7527449 0.7895019 0.2413609
## 5 0.8159247 0.8486533 0.0000000
## 6 0.6728061 0.7131270 0.2215049
## 7 0.8814466 0.9083722 0.6286271
## 8 0.8645784 0.8932025 0.5944691
## 9 0.8891448 0.9152364 0.6451546
## 10 0.8589743 0.8881272 0.4517912
## 11 0.8948040 0.9202563 0.6757737
## max.AccuracySD.fit max.KappaSD.fit min.aic.fit
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 0.002468564 0.018885476 NA
## 6 0.001627400 0.007870559 3674.923
## 7 0.253696030 0.264729843 2504.643
## 8 0.008518875 0.029846075 2443.257
## 9 0.238751463 0.357486676 29927.970
## 10 0.014254671 0.073840685 NA
## 11 NA NA NA
# Log a new minor checkpoint for the "fit.models" step in the chunk-timing
# table; major.inc=FALSE bumps only step_minor (6.1 -> 6.2).
# myadd_chunk comes from the sourced helper scripts -- TODO confirm location.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 10 fit.models 6 1 214.381 565.526 351.145
## 11 fit.models 6 2 565.526 NA NA
# If a custom evaluation metric was configured (glb_model_metric_smmry),
# re-score every model on both the fit and OOB partitions with
# mypredict_mdl(ret_type="stats") and splice the resulting metric columns
# into glb_models_df.  Skipped entirely when no custom metric is set.
if (!is.null(glb_model_metric_smmry)) {
# One-column data.frame of model ids; third positional arg FALSE is drop=FALSE,
# keeping the data.frame shape for the merges below.
stats_df <- glb_models_df[, "model_id", FALSE]
stats_mdl_df <- data.frame()
# Score each model on the fit (training) partition.
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_fitent_df, glb_rsp_var,
glb_rsp_var_out, model_id, "fit",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
stats_mdl_df <- data.frame()
# Score each model again on the OOB (held-out) partition.
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_OOBent_df, glb_rsp_var,
glb_rsp_var_out, model_id, "OOB",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
# Sanity-check scaffolding (kept for reference): compare freshly computed
# stats against what is already in glb_models_df.
# tmp_models_df <- orderBy(~model_id, glb_models_df)
# rownames(tmp_models_df) <- seq(1, nrow(tmp_models_df))
# all.equal(subset(tmp_models_df[, names(stats_df)], model_id != "Random.myrandom_classfr"),
# subset(stats_df, model_id != "Random.myrandom_classfr"))
# print(subset(tmp_models_df[, names(stats_df)], model_id != "Random.myrandom_classfr")[, c("model_id", "max.Accuracy.fit")])
# print(subset(stats_df, model_id != "Random.myrandom_classfr")[, c("model_id", "max.Accuracy.fit")])
print("Merging following data into glb_models_df:")
# Columns to merge in: model_id (col 1) plus every column whose name
# matches the configured metric pattern.
print(stats_mrg_df <- stats_df[, c(1, grep(glb_model_metric, names(stats_df)))])
# orderBy is from the doBy package -- TODO confirm it is loaded upstream.
print(tmp_models_df <- orderBy(~model_id, glb_models_df[, c("model_id", grep(glb_model_metric, names(stats_df), value=TRUE))]))
# Drop the stale metric columns, then merge the recomputed ones back in.
tmp2_models_df <- glb_models_df[, c("model_id", setdiff(names(glb_models_df), grep(glb_model_metric, names(stats_df), value=TRUE)))]
tmp3_models_df <- merge(tmp2_models_df, stats_mrg_df, all.x=TRUE, sort=FALSE)
print(tmp3_models_df)
print(names(tmp3_models_df))
# merge() duplicates the key column as model_id.1; remove it and commit.
print(glb_models_df <- subset(tmp3_models_df, select=-model_id.1))
}
# Build a plotting-friendly copy of the model-comparison table:
# drop the CI-bound / SD columns, then convert each "min.*" metric
# (smaller-is-better, e.g. elapsed time, AIC) into a reciprocal "inv.*"
# column (larger-is-better) so every metric plots in the same direction.
drop_cols_ix <- grep("SD|Upper|Lower", names(glb_models_df))
# Guard: df[, -integer(0)] silently drops ALL columns, so only negate
# the index when at least one column matched.
plt_models_df <- if (length(drop_cols_ix) > 0) {
    glb_models_df[, -drop_cols_ix]
} else {
    glb_models_df
}
# Anchor and escape the dot ("^min\\.") so the pattern cannot match an
# arbitrary character the way the original "^min." could.
for (var in grep("^min\\.", names(plt_models_df), value=TRUE)) {
    # Reciprocal transform; NA entries (models lacking this metric) propagate.
    plt_models_df[, sub("^min\\.", "inv.", var)] <- 1.0 / plt_models_df[, var]
    # Drop the source column by exact name; the original grep(var, ...) used
    # var as an unescaped regex and would drop everything on a non-match.
    plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop=FALSE]
}
print(plt_models_df)
## model_id model_method
## 1 MFO.myMFO_classfr myMFO_classfr
## 2 Random.myrandom_classfr myrandom_classfr
## 3 Max.cor.Y.cv.0.rpart rpart
## 4 Max.cor.Y.cv.0.cp.0.rpart rpart
## 5 Max.cor.Y.rpart rpart
## 6 Max.cor.Y.glm glm
## 7 Interact.High.cor.Y.glm glm
## 8 Low.cor.X.glm glm
## 9 Conditional.X.glm glm
## 10 Conditional.X.no.rnorm.rpart rpart
## 11 Conditional.X.no.rnorm.rf rf
## feats
## 1 .rnorm
## 2 .rnorm
## 3 WordCount.log
## 4 WordCount.log
## 5 WordCount.log
## 6 WordCount.log
## 7 WordCount.log, WordCount.log:PubDate.apm.fctr, WordCount.log:S.can, WordCount.log:S.make, WordCount.log:S.presid, WordCount.log:S.take, WordCount.log:S.new, WordCount.log:S.day, WordCount.log:S.show, WordCount.log:S.report, WordCount.log:S.share, WordCount.log:S.year, WordCount.log:S.compani, WordCount.log:S.first, WordCount.log:S.time, WordCount.log:S.articl, WordCount.log:S.will, WordCount.log:S.newyork, WordCount.log:S.intern, WordCount.log:H.week, WordCount.log:S.week, WordCount.log:S.fashion, WordCount.log:SectionName.nb.fctr, WordCount.log:H.num.chars.log, WordCount.log:NewsDesk.nb.fctr, WordCount.log:A.num.chars.log, WordCount.log:A.num.words.log, WordCount.log:S.num.chars.log
## 8 WordCount.log, H.is.question, PubDate.apm.fctr, S.can, H.has.ebola, S.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, S.take, PubDate.minute, S.new, PubDate.wkday.fctr, S.day, H.X2014, S.show, S.report, S.share, S.year, S.compani, H.new, S.first, S.time, H.newyork, S.articl, S.will, H.day, S.newyork, H.today, H.report, S.intern, H.week, S.week, S.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, H.num.words.log, A.num.chars.log, A.num.words.log, A.num.words.unq.log
## 9 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, .rnorm, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## 10 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## 11 WordCount.log, PubDate.hour, H.is.question, PubDate.apm.fctr, A.can, S.can, H.has.ebola, S.make, A.make, S.one, S.state, A.state, A.one, S.said, A.said, PubDate.date.fctr, PubDate.second, S.presid, A.presid, S.take, A.take, PubDate.minute, S.new, A.new, PubDate.wkday.fctr, S.day, A.day, H.X2014, S.show, A.show, S.report, A.report, S.share, A.share, S.year, A.year, S.compani, A.compani, H.new, S.first, A.first, S.time, A.time, H.newyork, S.articl, A.articl, S.will, A.will, H.day, S.newyork, A.newyork, H.today, H.report, S.intern, A.intern, H.week, H.fashion, S.week, A.week, S.fashion, A.fashion, Headline.pfx.fctr, SectionName.nb.fctr, H.num.chars.log, NewsDesk.nb.fctr, H.num.words.log, H.num.words.unq.log, SubsectionName.nb.fctr, A.num.chars.log, S.num.chars.log, A.num.words.log, S.num.words.log, A.num.words.unq.log, S.num.words.unq.log
## max.nTuningRuns max.auc.fit opt.prob.threshold.fit max.f.score.fit
## 1 0 0.5000000 0.5 0.0000000
## 2 0 0.4975446 0.1 0.2867534
## 3 0 0.5000000 0.5 0.0000000
## 4 0 0.7074274 0.2 0.4572127
## 5 3 0.5000000 0.5 0.0000000
## 6 1 0.7301738 0.2 0.4222222
## 7 1 0.9117549 0.3 0.6837945
## 8 1 0.9282722 0.2 0.6859688
## 9 1 0.8204389 0.9 0.7146814
## 10 3 0.6662843 0.7 0.4808549
## 11 3 1.0000000 0.6 1.0000000
## max.Accuracy.fit max.Kappa.fit max.auc.OOB opt.prob.threshold.OOB
## 1 0.8326257 0.00000000 0.5000000 0.5
## 2 0.1673743 0.00000000 0.4821958 0.1
## 3 0.8326257 0.00000000 0.5000000 0.5
## 4 0.8015642 0.33685715 0.6504263 0.2
## 5 0.8174308 0.06210715 0.5000000 0.5
## 6 0.8306149 0.01169498 0.7342331 0.2
## 7 0.7265090 0.29915157 0.9127805 0.3
## 8 0.8867060 0.55652112 0.9144538 0.3
## 9 0.7562325 0.39825605 0.8173314 0.9
## 10 0.8744112 0.44056055 0.6862731 0.7
## 11 0.9099441 0.65617265 0.9314323 0.4
## max.f.score.OOB max.Accuracy.OOB max.Kappa.OOB
## 1 0.0000000 0.8327662 0.0000000
## 2 0.2865473 0.1672338 0.0000000
## 3 0.0000000 0.8327662 0.0000000
## 4 0.3799472 0.7715119 0.2413609
## 5 0.0000000 0.8327662 0.0000000
## 6 0.3961722 0.6932426 0.2215049
## 7 0.6915352 0.8954789 0.6286271
## 8 0.6675603 0.8794361 0.5944691
## 9 0.7032641 0.9027710 0.6451546
## 10 0.5176909 0.8740885 0.4517912
## 11 0.7311522 0.9081186 0.6757737
## inv.elapsedtime.everything inv.elapsedtime.final inv.aic.fit
## 1 1.440922190 333.33333333 NA
## 2 3.105590062 500.00000000 NA
## 3 1.422475107 15.15151515 NA
## 4 1.692047377 17.24137931 NA
## 5 0.776397516 15.15151515 NA
## 6 0.861326443 13.33333333 2.721146e-04
## 7 0.155159038 0.53705693 3.992585e-04
## 8 0.120569086 0.41806020 4.092898e-04
## 9 0.039717213 0.13015749 3.341356e-05
## 10 0.096899225 0.54854635 NA
## 11 0.003374354 0.01335666 NA
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 11. Consider specifying shapes manually. if you must have them.
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 11. Consider specifying shapes manually. if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
#     !(model_id %in% grep("random|MFO", plt_models_df$model_id, value=TRUE)))))

# Compute a 95% t-based confidence interval for every <metric>SD column
glb_models_df <- mutate(glb_models_df,
    max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
    min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
for (sd_col in grep("SD", names(glb_models_df), value=TRUE)) {
    parts <- unlist(strsplit(sd_col, "SD"))
    stat_col  <- paste0(parts[1], parts[2])
    upper_col <- paste0(parts[1], "Upper", parts[2])
    lower_col <- paste0(parts[1], "Lower", parts[2])
    # Skip metrics whose CI bounds were already computed upstream
    # (the matching Lower column is assumed to exist alongside Upper)
    if (upper_col %in% names(glb_models_df)) {
        warning(upper_col, " already exists in glb_models_df")
        next
    }
    print(sprintf("var:%s", sd_col))
    # CI width depends on sample size via the t distribution; df = n - 1
    ci_margin <- glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, sd_col]
    glb_models_df[, upper_col] <- glb_models_df[, stat_col] + ci_margin
    glb_models_df[, lower_col] <- glb_models_df[, stat_col] - ci_margin
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Plot metrics with CI: split point estimates (plt_models_df) from their
# Upper/Lower CI bounds (pltCI_models_df), both keyed by model_id
plt_models_df <- glb_models_df[, "model_id", FALSE]
pltCI_models_df <- glb_models_df[, "model_id", FALSE]
for (upper_col in grep("Upper", names(glb_models_df), value=TRUE)) {
    pieces <- unlist(strsplit(upper_col, "Upper"))
    # point-estimate column = bound column name with "Upper" removed
    stat_col <- paste(pieces, collapse="")
    plt_models_df[, stat_col] <- glb_models_df[, stat_col]
    for (bound_col in paste0(pieces[1], c("Upper", "Lower"), pieces[2]))
        pltCI_models_df[, bound_col] <- glb_models_df[, bound_col]
}
# Melt model metrics to long format and derive two helper columns:
#   data  - the partition suffix, i.e. the last "."-separated token of the
#           variable name (e.g. "fit" or "OOB")
#   label - the metric name, i.e. the variable with ".<data>" removed
# Returns the melted data.frame with "data" and "label" columns appended.
build_statsCI_data <- function(plt_models_df) {
    mltd_models_df <- melt(plt_models_df, id.vars="model_id")
    var_names <- as.character(mltd_models_df$variable)
    mltd_models_df$data <- vapply(strsplit(var_names, ".", fixed=TRUE),
                                  function(toks) tail(toks, 1), character(1))
    # fixed=TRUE: the original regex paste0(".", data) treated "." as a
    # wildcard, so any character followed by the data token could split
    # the name at the wrong place
    mltd_models_df$label <- vapply(seq_along(var_names), function(row_ix)
        head(unlist(strsplit(var_names[row_ix],
                             paste0(".", mltd_models_df$data[row_ix]),
                             fixed=TRUE)), 1),
        character(1))
    #print(mltd_models_df)
    return(mltd_models_df)
}
# Tag each melted CI row with its metric label, data partition, and bound type
mltd_models_df <- build_statsCI_data(plt_models_df)
mltdCI_models_df <- melt(pltCI_models_df, id.vars="model_id")
# seq_len() instead of 1:nrow(): 1:0 would iterate c(1, 0) on an empty frame
for (row_ix in seq_len(nrow(mltdCI_models_df))) {
    for (type in c("Upper", "Lower")) {
        # Splitting on the bound keyword yields > 1 piece only when present
        if (length(var_components <- unlist(strsplit(
                as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
            #print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
            mltdCI_models_df[row_ix, "label"] <- var_components[1]
            # partition is the token after the "." that follows the bound type
            mltdCI_models_df[row_ix, "data"] <-
                unlist(strsplit(var_components[2], "[.]"))[2]
            mltdCI_models_df[row_ix, "type"] <- type
            break
        }
    }
}
#print(mltdCI_models_df)
# castCI_models_df <- dcast(mltdCI_models_df, value ~ type, fun.aggregate=sum)
# print(castCI_models_df)
# Pivot Upper/Lower rows into value.Upper / value.Lower columns per metric
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
# Collect label/data combinations that are absent from the melted stats
goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
for (type in unique(mltd_models_df$data)) {
var_type <- paste0(var, ".", type)
# if this data is already present, next
if (var_type %in% unique(paste(mltd_models_df$label, mltd_models_df$data,
sep=".")))
next
#print(sprintf("var_type:%s", var_type))
goback_vars <- c(goback_vars, var_type)
}
}
if (length(goback_vars) > 0) {
# NOTE(review): assumes every goback_var exists as a column of
# glb_models_df; a label/data combination missing there would error here
mltd_goback_df <- build_statsCI_data(glb_models_df[, c("model_id", goback_vars)])
mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# Attach model_method so the bar plot can color bars per algorithm
mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("model_id", "model_method")],
all.x=TRUE)
# Render the metric comparison bar chart (with CI error bars) to a PNG,
# then replay the same plot on the active device for the knitted report
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
print(gp <- myplot_bar(mltd_models_df, "model_id", "value", colorcol_name="model_method") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=model_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
dev.off()
## quartz_off_screen
## 2
print(gp)
# used for console inspection
# Build the model-selection ordering formula: "max.*" metrics sort
# descending ("-"), everything else ascending ("+")
model_evl_terms <- c(NULL)
for (metric in glb_model_evl_criteria)
    # if/else on the scalar condition instead of ifelse() (vector idiom)
    model_evl_terms <- c(model_evl_terms,
                         if (grepl("max", metric)) "-" else "+", metric)
# For binomial classifiers, break ties on the lower OOB probability threshold
if (glb_is_classification && glb_is_binomial)
    model_evl_terms <- c(model_evl_terms, "-", "opt.prob.threshold.OOB")
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse=" "))
print(dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)
      [, c("model_id", glb_model_evl_criteria,
           # if/else (not ifelse) so the FALSE branch yields NULL, which c()
           # drops; ifelse(FALSE, x, NULL) errors with
           # "replacement has length zero"
           if (glb_is_classification && glb_is_binomial)
               "opt.prob.threshold.OOB" else NULL)])
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 11 Conditional.X.no.rnorm.rf 0.9081186 0.9314323 0.6757737
## 9 Conditional.X.glm 0.9027710 0.8173314 0.6451546
## 7 Interact.High.cor.Y.glm 0.8954789 0.9127805 0.6286271
## 8 Low.cor.X.glm 0.8794361 0.9144538 0.5944691
## 10 Conditional.X.no.rnorm.rpart 0.8740885 0.6862731 0.4517912
## 1 MFO.myMFO_classfr 0.8327662 0.5000000 0.0000000
## 3 Max.cor.Y.cv.0.rpart 0.8327662 0.5000000 0.0000000
## 5 Max.cor.Y.rpart 0.8327662 0.5000000 0.0000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.7715119 0.6504263 0.2413609
## 6 Max.cor.Y.glm 0.6932426 0.7342331 0.2215049
## 2 Random.myrandom_classfr 0.1672338 0.4821958 0.0000000
## min.aic.fit opt.prob.threshold.OOB
## 11 NA 0.4
## 9 29927.970 0.9
## 7 2504.643 0.3
## 8 2443.257 0.3
## 10 NA 0.7
## 1 NA 0.5
## 3 NA 0.5
## 5 NA 0.5
## 4 NA 0.2
## 6 3674.923 0.2
## 2 NA 0.1
# Radar comparison of the models ranked by the selection criteria
print(myplot_radar(radar_inp_df=dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 11. Consider specifying shapes manually. if you must have them.
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 11. Consider specifying shapes manually. if you must have them.
# Report the selection criteria and the winning model
print("Metrics used for model selection:"); print(model_sel_frmla)
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.auc.OOB - max.Kappa.OOB + min.aic.fit -
##     opt.prob.threshold.OOB
print(sprintf("Best model id: %s", dsp_models_df[1, "model_id"]))
## [1] "Best model id: Conditional.X.no.rnorm.rf"
# Honor a user-specified model id when set; otherwise take the top-ranked one
if (is.null(glb_sel_mdl_id))
{ glb_sel_mdl_id <- dsp_models_df[1, "model_id"] } else
print(sprintf("User specified selection: %s", glb_sel_mdl_id))
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 4475 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 8950 matrix numeric
## oob.times 4475 -none- numeric
## classes 2 -none- character
## importance 182 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 4475 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 182 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] TRUE
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks

# Append predictions from model <mdl_id> to df.
#   df                 - observations to score
#   mdl_id             - key into glb_models_lst / glb_models_df
#   rsp_var_out        - prefix for the new prediction column(s); mdl_id is
#                        appended to it
#   prob_threshold_def - fallback classification cutoff when none is
#                        recorded for the model (binomial case only)
# Returns df with the prediction column (and, for binomial models, a
# ".prob" column) added.
glb_get_predictions <- function(df, mdl_id, rsp_var_out, prob_threshold_def=NULL) {
    mdl <- glb_models_lst[[mdl_id]]
    rsp_var_out <- paste0(rsp_var_out, mdl_id)

    if (glb_is_regression) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
        print(myplot_scatter(df, glb_rsp_var, rsp_var_out, smooth=TRUE))
        df[, paste0(rsp_var_out, ".err")] <-
            abs(df[, rsp_var_out] - df[, glb_rsp_var])
        # Bug fix: order by the ".err" column actually created above; the
        # original used glb_rsp_var_out, which lacks the mdl_id suffix and
        # therefore named a non-existent column
        print(head(orderBy(reformulate(c("-", paste0(rsp_var_out, ".err"))),
                           df)))
    }

    if (glb_is_classification && glb_is_binomial) {
        prob_threshold <- glb_models_df[glb_models_df$model_id == mdl_id,
                                        "opt.prob.threshold.OOB"]
        # Length guard also covers an unknown mdl_id (zero-row subset ->
        # numeric(0)), which previously crashed `is.null(x) || is.na(x)`
        if (length(prob_threshold) != 1 || is.na(prob_threshold)) {
            warning("Using default probability threshold: ", prob_threshold_def)
            if (is.null(prob_threshold <- prob_threshold_def))
                stop("Default probability threshold is NULL")
        }
        df[, paste0(rsp_var_out, ".prob")] <-
            predict(mdl, newdata=df, type="prob")[, 2]
        # Map P(positive) >= threshold to the 2nd factor level, else the 1st
        df[, rsp_var_out] <-
            factor(levels(df[, glb_rsp_var])[
                (df[, paste0(rsp_var_out, ".prob")] >=
                     prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))
        # prediction stats already reported by myfit_mdl ???
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
    }

    return(df)
}
# Score the OOB partition with the selected model and flag each observation
# as accurately predicted (TRUE/FALSE) for later error analysis
glb_OOBent_df <- glb_get_predictions(df=glb_OOBent_df, glb_sel_mdl_id, glb_rsp_var_out)
predct_accurate_var_name <- paste0(glb_rsp_var_out, glb_sel_mdl_id, ".accurate")
glb_OOBent_df[, predct_accurate_var_name] <-
(glb_OOBent_df[, glb_rsp_var] ==
glb_OOBent_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)])
# Fold the selected model's variable importances into the feature table,
# also keeping a per-model copy of the importance column
glb_feats_df <-
mymerge_feats_importance(feats_df=glb_feats_df, sel_mdl=glb_sel_mdl, glb_fitent_df)
glb_feats_df[, paste0(glb_sel_mdl_id, ".importance")] <- glb_feats_df$importance
print(glb_feats_df)
## id cor.y exclude.as.feat
## WordCount.log WordCount.log 0.265952699 FALSE
## SubsectionName.nb.fctr SubsectionName.nb.fctr -0.213860009 FALSE
## PubDate.hour PubDate.hour 0.159167673 FALSE
## H.num.chars.log H.num.chars.log -0.171062360 FALSE
## PubDate.minute PubDate.minute -0.031469083 FALSE
## SectionName.nb.fctr SectionName.nb.fctr -0.148701209 FALSE
## S.num.chars.log S.num.chars.log -0.224692967 FALSE
## A.num.chars.log A.num.chars.log -0.224548821 FALSE
## NewsDesk.nb.fctr NewsDesk.nb.fctr -0.172482671 FALSE
## PubDate.second PubDate.second -0.012253600 FALSE
## H.num.words.log H.num.words.log -0.200686356 FALSE
## Headline.pfx.fctr Headline.pfx.fctr -0.100052879 FALSE
## S.num.words.unq.log S.num.words.unq.log -0.250796919 FALSE
## H.num.words.unq.log H.num.words.unq.log -0.204496360 FALSE
## A.num.words.unq.log A.num.words.unq.log -0.250601203 FALSE
## S.num.words.log S.num.words.log -0.245354135 FALSE
## A.num.words.log A.num.words.log -0.245073324 FALSE
## H.is.question H.is.question 0.129154799 FALSE
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288 FALSE
## PubDate.apm.fctr PubDate.apm.fctr 0.101472715 FALSE
## PubDate.date.fctr PubDate.date.fctr -0.011647558 FALSE
## S.time S.time -0.057595102 FALSE
## A.time A.time -0.057790617 FALSE
## A.one A.one 0.005696039 FALSE
## S.one S.one 0.006342094 FALSE
## A.new A.new -0.035359447 FALSE
## S.new S.new -0.034948520 FALSE
## A.year A.year -0.051146178 FALSE
## S.can S.can 0.029999780 FALSE
## S.year S.year -0.051146178 FALSE
## H.day H.day -0.061669687 FALSE
## A.can A.can 0.031498867 FALSE
## S.will S.will -0.060575493 FALSE
## A.report A.report -0.050211524 FALSE
## A.will A.will -0.061025004 FALSE
## A.state A.state 0.005702163 FALSE
## S.week S.week -0.084814939 FALSE
## S.report S.report -0.050211524 FALSE
## A.said A.said 0.001363226 FALSE
## S.said S.said 0.001363226 FALSE
## S.state S.state 0.006069626 FALSE
## A.compani A.compani -0.053099633 FALSE
## S.newyork S.newyork -0.062117105 FALSE
## A.newyork A.newyork -0.062117105 FALSE
## A.week A.week -0.084814939 FALSE
## S.compani S.compani -0.053012962 FALSE
## A.take A.take -0.026086108 FALSE
## S.take S.take -0.025762398 FALSE
## A.make A.make 0.023138853 FALSE
## H.has.ebola H.has.ebola 0.025881397 FALSE
## S.make S.make 0.023138853 FALSE
## S.share S.share -0.050329686 FALSE
## A.share A.share -0.050329686 FALSE
## S.presid S.presid -0.019828826 FALSE
## S.show S.show -0.048801740 FALSE
## A.presid A.presid -0.019828826 FALSE
## A.show A.show -0.048801740 FALSE
## S.day S.day -0.045649185 FALSE
## A.day A.day -0.045909684 FALSE
## H.new H.new -0.053121542 FALSE
## S.first S.first -0.053388178 FALSE
## A.first A.first -0.053388178 FALSE
## S.intern S.intern -0.068485701 FALSE
## A.intern A.intern -0.068485701 FALSE
## H.report H.report -0.064948102 FALSE
## H.today H.today -0.063723058 FALSE
## H.week H.week -0.075105216 FALSE
## H.X2014 H.X2014 -0.046206380 FALSE
## H.newyork H.newyork -0.057970095 FALSE
## S.articl S.articl -0.059520554 FALSE
## A.articl A.articl -0.059520554 FALSE
## H.fashion H.fashion -0.081708612 FALSE
## A.fashion A.fashion -0.086446251 FALSE
## S.fashion S.fashion -0.086446251 FALSE
## .rnorm .rnorm -0.008703337 FALSE
## A.has.http A.has.http -0.013592603 FALSE
## A.num.chars A.num.chars -0.177037425 TRUE
## A.num.words A.num.words -0.204211072 TRUE
## A.num.words.unq A.num.words.unq -0.210242145 TRUE
## H.daili H.daili -0.069192975 FALSE
## H.has.http H.has.http NA FALSE
## H.num.chars H.num.chars -0.147211183 TRUE
## H.num.words H.num.words -0.186036895 TRUE
## H.num.words.unq H.num.words.unq -0.189702157 TRUE
## H.X2015 H.X2015 -0.066584892 FALSE
## Popular Popular 1.000000000 TRUE
## Popular.fctr Popular.fctr NA TRUE
## PubDate.month.fctr PubDate.month.fctr 0.019148739 TRUE
## PubDate.year PubDate.year NA TRUE
## S.has.http S.has.http NA FALSE
## S.num.chars S.num.chars -0.179331806 TRUE
## S.num.words S.num.words -0.206385049 TRUE
## S.num.words.unq S.num.words.unq -0.212102717 TRUE
## UniqueID UniqueID 0.011824920 TRUE
## WordCount WordCount 0.257526549 TRUE
## cor.y.abs cor.high.X is.ConditionalX.y
## WordCount.log 0.265952699 <NA> TRUE
## SubsectionName.nb.fctr 0.213860009 NewsDesk.nb.fctr TRUE
## PubDate.hour 0.159167673 PubDate.apm.fctr TRUE
## H.num.chars.log 0.171062360 <NA> TRUE
## PubDate.minute 0.031469083 <NA> TRUE
## SectionName.nb.fctr 0.148701209 <NA> TRUE
## S.num.chars.log 0.224692967 A.num.chars.log TRUE
## A.num.chars.log 0.224548821 <NA> TRUE
## NewsDesk.nb.fctr 0.172482671 SectionName.nb.fctr TRUE
## PubDate.second 0.012253600 <NA> TRUE
## H.num.words.log 0.200686356 <NA> TRUE
## Headline.pfx.fctr 0.100052879 <NA> TRUE
## S.num.words.unq.log 0.250796919 S.num.chars.log TRUE
## H.num.words.unq.log 0.204496360 H.num.chars.log TRUE
## A.num.words.unq.log 0.250601203 <NA> TRUE
## S.num.words.log 0.245354135 A.num.words.log TRUE
## A.num.words.log 0.245073324 <NA> TRUE
## H.is.question 0.129154799 <NA> TRUE
## PubDate.wkday.fctr 0.039801288 <NA> TRUE
## PubDate.apm.fctr 0.101472715 <NA> TRUE
## PubDate.date.fctr 0.011647558 <NA> TRUE
## S.time 0.057595102 <NA> TRUE
## A.time 0.057790617 S.time TRUE
## A.one 0.005696039 <NA> TRUE
## S.one 0.006342094 <NA> TRUE
## A.new 0.035359447 S.new TRUE
## S.new 0.034948520 <NA> TRUE
## A.year 0.051146178 S.year TRUE
## S.can 0.029999780 <NA> TRUE
## S.year 0.051146178 <NA> TRUE
## H.day 0.061669687 <NA> TRUE
## A.can 0.031498867 S.can TRUE
## S.will 0.060575493 <NA> TRUE
## A.report 0.050211524 S.report TRUE
## A.will 0.061025004 S.will TRUE
## A.state 0.005702163 <NA> TRUE
## S.week 0.084814939 <NA> TRUE
## S.report 0.050211524 <NA> TRUE
## A.said 0.001363226 <NA> TRUE
## S.said 0.001363226 <NA> TRUE
## S.state 0.006069626 <NA> TRUE
## A.compani 0.053099633 S.compani TRUE
## S.newyork 0.062117105 <NA> TRUE
## A.newyork 0.062117105 S.newyork TRUE
## A.week 0.084814939 S.week TRUE
## S.compani 0.053012962 <NA> TRUE
## A.take 0.026086108 S.take TRUE
## S.take 0.025762398 <NA> TRUE
## A.make 0.023138853 S.make TRUE
## H.has.ebola 0.025881397 <NA> TRUE
## S.make 0.023138853 <NA> TRUE
## S.share 0.050329686 <NA> TRUE
## A.share 0.050329686 S.share TRUE
## S.presid 0.019828826 <NA> TRUE
## S.show 0.048801740 <NA> TRUE
## A.presid 0.019828826 S.presid TRUE
## A.show 0.048801740 S.show TRUE
## S.day 0.045649185 <NA> TRUE
## A.day 0.045909684 S.day TRUE
## H.new 0.053121542 <NA> TRUE
## S.first 0.053388178 <NA> TRUE
## A.first 0.053388178 S.first TRUE
## S.intern 0.068485701 <NA> TRUE
## A.intern 0.068485701 S.intern TRUE
## H.report 0.064948102 <NA> TRUE
## H.today 0.063723058 <NA> TRUE
## H.week 0.075105216 <NA> TRUE
## H.X2014 0.046206380 <NA> TRUE
## H.newyork 0.057970095 <NA> TRUE
## S.articl 0.059520554 <NA> TRUE
## A.articl 0.059520554 S.articl TRUE
## H.fashion 0.081708612 H.week TRUE
## A.fashion 0.086446251 S.fashion TRUE
## S.fashion 0.086446251 <NA> TRUE
## .rnorm 0.008703337 <NA> TRUE
## A.has.http 0.013592603 <NA> FALSE
## A.num.chars 0.177037425 <NA> NA
## A.num.words 0.204211072 <NA> NA
## A.num.words.unq 0.210242145 <NA> NA
## H.daili 0.069192975 <NA> FALSE
## H.has.http NA <NA> FALSE
## H.num.chars 0.147211183 <NA> NA
## H.num.words 0.186036895 <NA> NA
## H.num.words.unq 0.189702157 <NA> NA
## H.X2015 0.066584892 <NA> FALSE
## Popular 1.000000000 <NA> NA
## Popular.fctr NA <NA> NA
## PubDate.month.fctr 0.019148739 <NA> NA
## PubDate.year NA <NA> NA
## S.has.http NA <NA> FALSE
## S.num.chars 0.179331806 <NA> NA
## S.num.words 0.206385049 <NA> NA
## S.num.words.unq 0.212102717 <NA> NA
## UniqueID 0.011824920 <NA> NA
## WordCount 0.257526549 <NA> NA
## is.cor.y.abs.low rsp_var_raw id_var rsp_var
## WordCount.log FALSE FALSE NA NA
## SubsectionName.nb.fctr FALSE FALSE NA NA
## PubDate.hour FALSE FALSE NA NA
## H.num.chars.log FALSE FALSE NA NA
## PubDate.minute FALSE FALSE NA NA
## SectionName.nb.fctr FALSE FALSE NA NA
## S.num.chars.log FALSE FALSE NA NA
## A.num.chars.log FALSE FALSE NA NA
## NewsDesk.nb.fctr FALSE FALSE NA NA
## PubDate.second FALSE FALSE NA NA
## H.num.words.log FALSE FALSE NA NA
## Headline.pfx.fctr FALSE FALSE NA NA
## S.num.words.unq.log FALSE FALSE NA NA
## H.num.words.unq.log FALSE FALSE NA NA
## A.num.words.unq.log FALSE FALSE NA NA
## S.num.words.log FALSE FALSE NA NA
## A.num.words.log FALSE FALSE NA NA
## H.is.question FALSE FALSE NA NA
## PubDate.wkday.fctr FALSE FALSE NA NA
## PubDate.apm.fctr FALSE FALSE NA NA
## PubDate.date.fctr FALSE FALSE NA NA
## S.time FALSE FALSE NA NA
## A.time FALSE FALSE NA NA
## A.one TRUE FALSE NA NA
## S.one TRUE FALSE NA NA
## A.new FALSE FALSE NA NA
## S.new FALSE FALSE NA NA
## A.year FALSE FALSE NA NA
## S.can FALSE FALSE NA NA
## S.year FALSE FALSE NA NA
## H.day FALSE FALSE NA NA
## A.can FALSE FALSE NA NA
## S.will FALSE FALSE NA NA
## A.report FALSE FALSE NA NA
## A.will FALSE FALSE NA NA
## A.state TRUE FALSE NA NA
## S.week FALSE FALSE NA NA
## S.report FALSE FALSE NA NA
## A.said TRUE FALSE NA NA
## S.said TRUE FALSE NA NA
## S.state TRUE FALSE NA NA
## A.compani FALSE FALSE NA NA
## S.newyork FALSE FALSE NA NA
## A.newyork FALSE FALSE NA NA
## A.week FALSE FALSE NA NA
## S.compani FALSE FALSE NA NA
## A.take FALSE FALSE NA NA
## S.take FALSE FALSE NA NA
## A.make FALSE FALSE NA NA
## H.has.ebola FALSE FALSE NA NA
## S.make FALSE FALSE NA NA
## S.share FALSE FALSE NA NA
## A.share FALSE FALSE NA NA
## S.presid FALSE FALSE NA NA
## S.show FALSE FALSE NA NA
## A.presid FALSE FALSE NA NA
## A.show FALSE FALSE NA NA
## S.day FALSE FALSE NA NA
## A.day FALSE FALSE NA NA
## H.new FALSE FALSE NA NA
## S.first FALSE FALSE NA NA
## A.first FALSE FALSE NA NA
## S.intern FALSE FALSE NA NA
## A.intern FALSE FALSE NA NA
## H.report FALSE FALSE NA NA
## H.today FALSE FALSE NA NA
## H.week FALSE FALSE NA NA
## H.X2014 FALSE FALSE NA NA
## H.newyork FALSE FALSE NA NA
## S.articl FALSE FALSE NA NA
## A.articl FALSE FALSE NA NA
## H.fashion FALSE FALSE NA NA
## A.fashion FALSE FALSE NA NA
## S.fashion FALSE FALSE NA NA
## .rnorm FALSE FALSE NA NA
## A.has.http FALSE FALSE NA NA
## A.num.chars FALSE FALSE NA NA
## A.num.words FALSE FALSE NA NA
## A.num.words.unq FALSE FALSE NA NA
## H.daili FALSE FALSE NA NA
## H.has.http NA FALSE NA NA
## H.num.chars FALSE FALSE NA NA
## H.num.words FALSE FALSE NA NA
## H.num.words.unq FALSE FALSE NA NA
## H.X2015 FALSE FALSE NA NA
## Popular FALSE TRUE NA NA
## Popular.fctr NA NA NA TRUE
## PubDate.month.fctr FALSE FALSE NA NA
## PubDate.year NA FALSE NA NA
## S.has.http NA FALSE NA NA
## S.num.chars FALSE FALSE NA NA
## S.num.words FALSE FALSE NA NA
## S.num.words.unq FALSE FALSE NA NA
## UniqueID FALSE FALSE TRUE NA
## WordCount FALSE FALSE NA NA
## importance Conditional.X.no.rnorm.rf.importance
## WordCount.log 100.00000000 100.00000000
## SubsectionName.nb.fctr 80.55962844 80.55962844
## PubDate.hour 50.40142882 50.40142882
## H.num.chars.log 31.50681999 31.50681999
## PubDate.minute 31.06935062 31.06935062
## SectionName.nb.fctr 30.96104916 30.96104916
## S.num.chars.log 29.83955854 29.83955854
## A.num.chars.log 27.42283005 27.42283005
## NewsDesk.nb.fctr 27.17840550 27.17840550
## PubDate.second 24.91688135 24.91688135
## H.num.words.log 12.00527830 12.00527830
## Headline.pfx.fctr 11.72217010 11.72217010
## S.num.words.unq.log 11.60835145 11.60835145
## H.num.words.unq.log 11.25894100 11.25894100
## A.num.words.unq.log 11.18544246 11.18544246
## S.num.words.log 10.97136866 10.97136866
## A.num.words.log 10.48234438 10.48234438
## H.is.question 7.54343001 7.54343001
## PubDate.wkday.fctr 5.06560307 5.06560307
## PubDate.apm.fctr 3.72443166 3.72443166
## PubDate.date.fctr 3.50248117 3.50248117
## S.time 2.28273783 2.28273783
## A.time 2.26463376 2.26463376
## A.one 2.23475110 2.23475110
## S.one 2.19710541 2.19710541
## A.new 2.12350133 2.12350133
## S.new 1.85916578 1.85916578
## A.year 1.73137113 1.73137113
## S.can 1.65012657 1.65012657
## S.year 1.62647972 1.62647972
## H.day 1.60400212 1.60400212
## A.can 1.50097929 1.50097929
## S.will 1.41958637 1.41958637
## A.report 1.38648928 1.38648928
## A.will 1.38263430 1.38263430
## A.state 1.29734430 1.29734430
## S.week 1.29638460 1.29638460
## S.report 1.25986121 1.25986121
## A.said 1.21943045 1.21943045
## S.said 1.20710524 1.20710524
## S.state 1.20583031 1.20583031
## A.compani 1.19678908 1.19678908
## S.newyork 1.15146558 1.15146558
## A.newyork 1.14950629 1.14950629
## A.week 1.13858903 1.13858903
## S.compani 1.12479942 1.12479942
## A.take 1.09050191 1.09050191
## S.take 1.00734157 1.00734157
## A.make 0.93790398 0.93790398
## H.has.ebola 0.88048418 0.88048418
## S.make 0.86990917 0.86990917
## S.share 0.83850954 0.83850954
## A.share 0.83670420 0.83670420
## S.presid 0.83648428 0.83648428
## S.show 0.79865434 0.79865434
## A.presid 0.79431490 0.79431490
## A.show 0.78546924 0.78546924
## S.day 0.74597854 0.74597854
## A.day 0.72495657 0.72495657
## H.new 0.68617761 0.68617761
## S.first 0.49663910 0.49663910
## A.first 0.44866721 0.44866721
## S.intern 0.44583700 0.44583700
## A.intern 0.42046341 0.42046341
## H.report 0.41504183 0.41504183
## H.today 0.34780584 0.34780584
## H.week 0.34413627 0.34413627
## H.X2014 0.29048530 0.29048530
## H.newyork 0.28817488 0.28817488
## S.articl 0.19414432 0.19414432
## A.articl 0.19190230 0.19190230
## H.fashion 0.13979364 0.13979364
## A.fashion 0.09319559 0.09319559
## S.fashion 0.08458414 0.08458414
## .rnorm NA NA
## A.has.http NA NA
## A.num.chars NA NA
## A.num.words NA NA
## A.num.words.unq NA NA
## H.daili NA NA
## H.has.http NA NA
## H.num.chars NA NA
## H.num.words NA NA
## H.num.words.unq NA NA
## H.X2015 NA NA
## Popular NA NA
## Popular.fctr NA NA
## PubDate.month.fctr NA NA
## PubDate.year NA NA
## S.has.http NA NA
## S.num.chars NA NA
## S.num.words NA NA
## S.num.words.unq NA NA
## UniqueID NA NA
## WordCount NA NA
# Used again in fit.data.training & predict.data.new chunks
# Diagnostic plots for a fitted model:
#   - scatter of the (<= 5) most important features vs. actual & predicted
#   - a prediction-vs-actual plot (regression) or classification error plot
#   obs_df         - observations already carrying the model's predictions
#   mdl_id         - selects the prediction columns (glb_rsp_var_out suffix)
#   prob_threshold - classification cutoff forwarded to the error plot
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
# Cap the per-feature scatter plots at the top 5 important features
if (length(vars <- subset(glb_feats_df, importance > 0)$id) > 5) {
warning("Limiting important feature scatter plots to 5 out of ", length(vars))
vars <- vars[1:5]
}
require(reshape2)
rsp_var_out <- paste0(glb_rsp_var_out, mdl_id)
# One melted scatter per feature: actual vs. predicted response side by side
for (var in vars) {
plot_df <- melt(obs_df, id.vars=var,
measure.vars=c(glb_rsp_var, rsp_var_out))
# if (var == "<feat_name>") print(myplot_scatter(plot_df, var, "value",
# facet_colcol_name="variable") +
# geom_vline(xintercept=<divider_val>, linetype="dotted")) else
print(myplot_scatter(plot_df, var, "value", colorcol_name="variable",
facet_colcol_name="variable", jitter=TRUE) +
guides(color=FALSE))
}
if (glb_is_regression) {
# plot_vars_df <- subset(glb_feats_df, importance >
# glb_feats_df[glb_feats_df$id == ".rnorm", "importance"])
# Plot predictions against the two most important features
plot_vars_df <- orderBy(~ -importance, glb_feats_df)
if (nrow(plot_vars_df) == 0)
warning("No important features in glb_fin_mdl") else
print(myplot_prediction_regression(df=obs_df,
feat_x=ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2],
".rownames"),
feat_y=plot_vars_df$id[1],
rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
id_vars=glb_id_vars)
# + facet_wrap(reformulate(plot_vars_df$id[2])) # if [1 or 2] is a factor
# + geom_point(aes_string(color="<col_name>.fctr")) # to color the plot
)
}
if (glb_is_classification) {
# Plot misclassifications against the two most important features
if (nrow(plot_vars_df <- subset(glb_feats_df, importance > 0)) == 0)
warning("No features in selected model are statistically important")
else print(myplot_prediction_classification(df=obs_df,
feat_x=ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2],
".rownames"),
feat_y=plot_vars_df$id[1],
rsp_var=glb_rsp_var,
rsp_var_out=rsp_var_out,
id_vars=glb_id_vars,
prob_threshold=prob_threshold)
# + geom_hline(yintercept=<divider_val>, linetype = "dotted")
)
}
}
# Diagnostic plots for the selected model on the OOB partition.
glb_analytics_diag_plots(obs_df=glb_OOBent_df, mdl_id=glb_sel_mdl_id,
    # if/else rather than ifelse(): ifelse(FALSE, x, NULL) errors with
    # "replacement has length zero", so a regression run would crash here
    prob_threshold=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else NULL)
## Warning in glb_analytics_diag_plots(obs_df = glb_OOBent_df, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 74
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr
## 1654 1654 N
## 6370 6370 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 1654 0.004
## 6370 0.422
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 1654 N
## 6370 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 1654 TRUE
## 6370 TRUE
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.error .label
## 1654 0 1654
## 6370 0 6370
## [1] "Inaccurate: "
## UniqueID Popular.fctr
## 693 693 Y
## 4721 4721 Y
## 1156 1156 Y
## 3554 3554 Y
## 92 92 Y
## 3074 3074 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 693 0.000
## 4721 0.000
## 1156 0.002
## 3554 0.002
## 92 0.004
## 3074 0.006
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 693 N
## 4721 N
## 1156 N
## 3554 N
## 92 N
## 3074 N
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 693 FALSE
## 4721 FALSE
## 1156 FALSE
## 3554 FALSE
## 92 FALSE
## 3074 FALSE
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.error
## 693 -0.400
## 4721 -0.400
## 1156 -0.398
## 3554 -0.398
## 92 -0.396
## 3074 -0.394
## UniqueID Popular.fctr
## 450 450 Y
## 80 80 N
## 2054 2054 N
## 4646 4646 N
## 2682 2682 N
## 1020 1020 N
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 450 0.282
## 80 0.416
## 2054 0.418
## 4646 0.548
## 2682 0.734
## 1020 0.892
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 450 N
## 80 Y
## 2054 Y
## 4646 Y
## 2682 Y
## 1020 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 450 FALSE
## 80 FALSE
## 2054 FALSE
## 4646 FALSE
## 2682 FALSE
## 1020 FALSE
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.error
## 450 -0.118
## 80 0.016
## 2054 0.018
## 4646 0.148
## 2682 0.334
## 1020 0.492
## UniqueID Popular.fctr
## 4943 4943 N
## 4763 4763 N
## 4771 4771 N
## 2510 2510 N
## 17 17 N
## 4929 4929 N
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 4943 0.922
## 4763 0.938
## 4771 0.938
## 2510 0.944
## 17 0.960
## 4929 0.972
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 4943 Y
## 4763 Y
## 4771 Y
## 2510 Y
## 17 Y
## 4929 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 4943 FALSE
## 4763 FALSE
## 4771 FALSE
## 2510 FALSE
## 17 FALSE
## 4929 FALSE
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.error
## 4943 0.522
## 4763 0.538
## 4771 0.538
## 2510 0.544
## 17 0.560
## 4929 0.572
# Gather predictions from models better than MFO.*
#mdl_id <- "Conditional.X.rf"
#mdl_id <- "Conditional.X.cp.0.rpart"
#mdl_id <- "Conditional.X.rpart"
# glb_OOBent_df <- glb_get_predictions(df=glb_OOBent_df, mdl_id,
# glb_rsp_var_out)
# print(t(confusionMatrix(glb_OOBent_df[, paste0(glb_rsp_var_out, mdl_id)],
# glb_OOBent_df[, glb_rsp_var])$table))
# UniqueIDs of the false-negative OOB observations singled out for inspection
FN_OOB_ids <- c(4721, 4020, 693, 92)
# Show every response / prediction column for those observations
fn_oob_mask <- glb_OOBent_df$UniqueID %in% FN_OOB_ids
rsp_pred_cols <- grep(glb_rsp_var, names(glb_OOBent_df), value=TRUE)
print(glb_OOBent_df[fn_oob_mask, rsp_pred_cols])
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 92 Y 0.004
## 693 Y 0.000
## 4020 Y 0.032
## 4721 Y 0.000
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 92 N
## 693 N
## 4020 N
## 4721 N
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 92 FALSE
## 693 FALSE
## 4020 FALSE
## 4721 FALSE
# Inspect the top-5 ranked features for the false-negative OOB observations
fn_oob_rows <- glb_OOBent_df$UniqueID %in% FN_OOB_ids
print(glb_OOBent_df[fn_oob_rows, glb_feats_df$id[1:5]])
## WordCount.log SubsectionName.nb.fctr PubDate.hour H.num.chars.log
## 92 5.723585 Dealbook 8 4.248495
## 693 5.641907 Small Business 14 3.688879
## 4020 3.610918 Metro::N.Y. / Region 21 4.158883
## 4721 6.030685 Asia Pacific 4 4.189655
## PubDate.minute
## 92 2
## 693 10
## 4020 13
## 4721 23
# Inspect the raw text columns (headline / snippet / abstract) for the same rows
print(glb_OOBent_df[fn_oob_rows, glb_txt_vars])
## Headline
## 92 Moelis & Co. Hires Cantor, Ex-House Majority Leader, as Vice Chairman
## 693 Do You Hire Employees on a Trial Basis?
## 4020 Video: News Conference About Ebola Patient at Bellevue Hospital
## 4721 Hong Kong Politician Likens Protesters to African-American Slaves
## Snippet
## 92 Eric Cantor, who suffered a surprising electoral defeat this year, will be joining Moelis & Company as vice chairman and a director on its board.
## 693 Do you think job candidates are willing to work for three months on a contract before being hired full-time?
## 4020 A news conference about Dr. Craig Spencer at Bellevue Hospital who tested positive for the Ebola virus.
## 4721 A prominent businesswoman and politician has come under fire for saying, erroneously, that black Americans did not get voting rights for 107 years after the countrys slaves were freed, so Hong Kongers should also wait.
## Abstract
## 92 Eric Cantor, who suffered a surprising electoral defeat this year, will be joining Moelis & Company as vice chairman and a director on its board.
## 693 Do you think job candidates are willing to work for three months on a contract before being hired full-time?
## 4020 A news conference about Dr. Craig Spencer at Bellevue Hospital who tested positive for the Ebola virus.
## 4721 A prominent businesswoman and politician has come under fire for saying, erroneously, that black Americans did not get voting rights for 107 years after the countrys slaves were freed, so Hong Kongers should also wait.
# Export OOB observations (UniqueID plus all response / prediction columns)
# to a CSV named after the output prefix and the selected model id,
# with "." replaced by "_" so the model id is filesystem-friendly
oob_export_cols <- c("UniqueID",
                     grep(glb_rsp_var, names(glb_OOBent_df), fixed=TRUE, value=TRUE))
oob_csv_path <- paste0(gsub(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id), fixed=TRUE),
                       "_OOBent.csv")
write.csv(glb_OOBent_df[, oob_export_cols], oob_csv_path, row.names=FALSE)
# print(glb_entity_df[glb_entity_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# dsp_tbl(Headline.contains="[Ee]bola")
# sum(sel_obs(Headline.contains="[Ee]bola"))
# ftable(xtabs(Popular ~ NewsDesk.fctr, data=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,]))
# xtabs(NewsDesk ~ Popular, #Popular ~ NewsDesk.fctr,
# data=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,],
# exclude=NULL)
# print(mycreate_xtab_df(df=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular")))
# print(mycreate_tbl_df(df=glb_entity_df[sel_obs(Headline.contains="[Ee]bola") ,],
# tbl_col_names=c("Popular", "NewsDesk")))
# Advance the pipeline timing log to the next fit.models sub-step
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 11 fit.models 6 2 565.526 580.904 15.378
## 12 fit.models 6 3 580.904 NA NA
# Snapshot the consolidated entity data frame before folding predictions back in
sav_entity_df <- glb_entity_df
# Sanity-check which prediction columns exist only in the partition data frames
print(setdiff(names(glb_trnent_df), names(glb_entity_df)))
## character(0)
print(setdiff(names(glb_fitent_df), names(glb_entity_df)))
## character(0)
print(setdiff(names(glb_OOBent_df), names(glb_entity_df)))
## [1] "Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob"
## [2] "Popular.fctr.predict.Conditional.X.no.rnorm.rf"
## [3] "Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate"
# Copy the OOB-only prediction columns into the OOB slice of the entity df
# (a merge or cbind could be used instead)
oob_only_cols <- setdiff(names(glb_OOBent_df), names(glb_entity_df))
for (col in oob_only_cols) {
    glb_entity_df[glb_entity_df$.lcn == "OOB", col] <- glb_OOBent_df[, col]
}
print(setdiff(names(glb_newent_df), names(glb_entity_df)))
## character(0)
# Persist the selected-model state so later sessions can resume from disk
save(glb_feats_df,
     glb_entity_df, #glb_trnent_df, glb_fitent_df, glb_OOBent_df, glb_newent_df,
     glb_models_df, dsp_models_df, glb_models_lst, glb_sel_mdl, glb_sel_mdl_id,
     glb_model_type,
     file=paste0(glb_out_pfx, "selmdl_dsk.RData"))
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))
# Fire the "model.selected" transition in the analytics Petri net replay
replay.petrisim(pn=glb_analytics_pn,
                replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                                                          "model.selected")),
                flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Start the fit.data.training major step in the timing log
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 12 fit.models 6 3 580.904 648.457 67.553
## 13 fit.data.training 7 0 648.458 NA NA
# 7.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
# To create specific models interactively:
# glb_fin_mdl_id <- NULL; glb_fin_mdl <- NULL;
# glb_sel_mdl_id <- "Conditional.X.cp.0.rpart";
# glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]]; print(glb_sel_mdl)
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
    # A final model id is already registered; reuse the user-selected model
    warning("Final model same as user selected model")
    glb_fin_mdl <- glb_sel_mdl
} else {
    # The features that mattered in the selected model drive the final fit
    mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
                                        entity_df=glb_fitent_df)
    print(mdl_feats_df)
    # Resolve the underlying caret method; for "custom" wrappers the real
    # method name is the trailing dot-separated token of the model id
    model_method <- glb_sel_mdl$method
    if (model_method == "custom") {
        model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
    }
    # Pin every tuned hyper-parameter to its selected value (min == max)
    tune_finmdl_df <- NULL
    if (nrow(glb_sel_mdl$bestTune) > 0) {
        for (tune_param in names(glb_sel_mdl$bestTune)) {
            #print(sprintf("param: %s", tune_param))
            best_val <- glb_sel_mdl$bestTune[1, tune_param]
            if (best_val != "none") {
                tune_finmdl_df <- rbind(tune_finmdl_df,
                                        data.frame(parameter=tune_param,
                                                   min=best_val,
                                                   max=best_val,
                                                   by=1)) # by value is irrelevant for a fixed grid
            }
        }
    }
    # Sync with parameters in mydsutils.R
    ret_lst <- myfit_mdl(model_id="Final", model_method=model_method,
                         indep_vars_vctr=mdl_feats_df$id, model_type=glb_model_type,
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_trnent_df, OOB_df=NULL,
                         n_cv_folds=glb_n_cv_folds, tune_models_df=tune_finmdl_df,
                         # Automate from here
                         # Issues if glb_sel_mdl$method == "rf" b/c trainControl is "oob"; not "cv"
                         model_loss_mtrx=glb_model_metric_terms,
                         model_summaryFunction=glb_sel_mdl$control$summaryFunction,
                         model_metric=glb_sel_mdl$metric,
                         model_metric_maximize=glb_sel_mdl$maximize)
    # myfit_mdl appends the fitted model; pick up the newest entry and its id
    glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
    glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "model_id"]
}
## id importance
## WordCount.log WordCount.log 100.00000000
## SubsectionName.nb.fctr SubsectionName.nb.fctr 80.55962844
## PubDate.hour PubDate.hour 50.40142882
## H.num.chars.log H.num.chars.log 31.50681999
## PubDate.minute PubDate.minute 31.06935062
## SectionName.nb.fctr SectionName.nb.fctr 30.96104916
## S.num.chars.log S.num.chars.log 29.83955854
## A.num.chars.log A.num.chars.log 27.42283005
## NewsDesk.nb.fctr NewsDesk.nb.fctr 27.17840550
## PubDate.second PubDate.second 24.91688135
## H.num.words.log H.num.words.log 12.00527830
## Headline.pfx.fctr Headline.pfx.fctr 11.72217010
## S.num.words.unq.log S.num.words.unq.log 11.60835145
## H.num.words.unq.log H.num.words.unq.log 11.25894100
## A.num.words.unq.log A.num.words.unq.log 11.18544246
## S.num.words.log S.num.words.log 10.97136866
## A.num.words.log A.num.words.log 10.48234438
## H.is.question H.is.question 7.54343001
## PubDate.wkday.fctr PubDate.wkday.fctr 5.06560307
## PubDate.apm.fctr PubDate.apm.fctr 3.72443166
## PubDate.date.fctr PubDate.date.fctr 3.50248117
## S.time S.time 2.28273783
## A.time A.time 2.26463376
## A.one A.one 2.23475110
## S.one S.one 2.19710541
## A.new A.new 2.12350133
## S.new S.new 1.85916578
## A.year A.year 1.73137113
## S.can S.can 1.65012657
## S.year S.year 1.62647972
## H.day H.day 1.60400212
## A.can A.can 1.50097929
## S.will S.will 1.41958637
## A.report A.report 1.38648928
## A.will A.will 1.38263430
## A.state A.state 1.29734430
## S.week S.week 1.29638460
## S.report S.report 1.25986121
## A.said A.said 1.21943045
## S.said S.said 1.20710524
## S.state S.state 1.20583031
## A.compani A.compani 1.19678908
## S.newyork S.newyork 1.15146558
## A.newyork A.newyork 1.14950629
## A.week A.week 1.13858903
## S.compani S.compani 1.12479942
## A.take A.take 1.09050191
## S.take S.take 1.00734157
## A.make A.make 0.93790398
## H.has.ebola H.has.ebola 0.88048418
## S.make S.make 0.86990917
## S.share S.share 0.83850954
## A.share A.share 0.83670420
## S.presid S.presid 0.83648428
## S.show S.show 0.79865434
## A.presid A.presid 0.79431490
## A.show A.show 0.78546924
## S.day S.day 0.74597854
## A.day A.day 0.72495657
## H.new H.new 0.68617761
## S.first S.first 0.49663910
## A.first A.first 0.44866721
## S.intern S.intern 0.44583700
## A.intern A.intern 0.42046341
## H.report H.report 0.41504183
## H.today H.today 0.34780584
## H.week H.week 0.34413627
## H.X2014 H.X2014 0.29048530
## H.newyork H.newyork 0.28817488
## S.articl S.articl 0.19414432
## A.articl A.articl 0.19190230
## H.fashion H.fashion 0.13979364
## A.fashion A.fashion 0.09319559
## S.fashion S.fashion 0.08458414
## [1] "fitting model: Final.rf"
## [1] " indep_vars: WordCount.log, SubsectionName.nb.fctr, PubDate.hour, H.num.chars.log, PubDate.minute, SectionName.nb.fctr, S.num.chars.log, A.num.chars.log, NewsDesk.nb.fctr, PubDate.second, H.num.words.log, Headline.pfx.fctr, S.num.words.unq.log, H.num.words.unq.log, A.num.words.unq.log, S.num.words.log, A.num.words.log, H.is.question, PubDate.wkday.fctr, PubDate.apm.fctr, PubDate.date.fctr, S.time, A.time, A.one, S.one, A.new, S.new, A.year, S.can, S.year, H.day, A.can, S.will, A.report, A.will, A.state, S.week, S.report, A.said, S.said, S.state, A.compani, S.newyork, A.newyork, A.week, S.compani, A.take, S.take, A.make, H.has.ebola, S.make, S.share, A.share, S.presid, S.show, A.presid, A.show, S.day, A.day, H.new, S.first, A.first, S.intern, A.intern, H.report, H.today, H.week, H.X2014, H.newyork, S.articl, A.articl, H.fashion, A.fashion, S.fashion"
## + : mtry=92
## - : mtry=92
## Aggregating results
## Fitting final model on full training set
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 6532 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 13064 matrix numeric
## oob.times 6532 -none- numeric
## classes 2 -none- character
## importance 182 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 6532 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 182 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.28668852
## 2 0.1 0.79897661
## 3 0.2 0.92080876
## 4 0.3 0.97502230
## 5 0.4 0.99954275
## 6 0.5 1.00000000
## 7 0.6 0.99954233
## 8 0.7 0.92895639
## 9 0.8 0.82913765
## 10 0.9 0.62774639
## 11 1.0 0.02171946
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## Popular.fctr Popular.fctr.predict.Final.rf.N
## 1 N 5439
## 2 Y NA
## Popular.fctr.predict.Final.rf.Y
## 1 NA
## 2 1093
## Prediction
## Reference N Y
## N 5439 0
## Y 0 1093
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.0000000 1.0000000 0.9994354 1.0000000 0.8326699
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
## Warning in mypredict_mdl(mdl, df = fit_df, rsp_var, rsp_var_out,
## model_id_method, : Expecting 1 metric: Accuracy; recd: Accuracy, Kappa;
## retaining Accuracy only
## model_id model_method
## 1 Final.rf rf
## feats
## 1 WordCount.log, SubsectionName.nb.fctr, PubDate.hour, H.num.chars.log, PubDate.minute, SectionName.nb.fctr, S.num.chars.log, A.num.chars.log, NewsDesk.nb.fctr, PubDate.second, H.num.words.log, Headline.pfx.fctr, S.num.words.unq.log, H.num.words.unq.log, A.num.words.unq.log, S.num.words.log, A.num.words.log, H.is.question, PubDate.wkday.fctr, PubDate.apm.fctr, PubDate.date.fctr, S.time, A.time, A.one, S.one, A.new, S.new, A.year, S.can, S.year, H.day, A.can, S.will, A.report, A.will, A.state, S.week, S.report, A.said, S.said, S.state, A.compani, S.newyork, A.newyork, A.week, S.compani, A.take, S.take, A.make, H.has.ebola, S.make, S.share, A.share, S.presid, S.show, A.presid, A.show, S.day, A.day, H.new, S.first, A.first, S.intern, A.intern, H.report, H.today, H.week, H.X2014, H.newyork, S.articl, A.articl, H.fashion, A.fashion, S.fashion
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 256.811 126.356
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 1 0.5 1 0.9116656
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9994354 1 0.6655214
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 13 fit.data.training 7 0 648.458 911.375 262.917
## 14 fit.data.training 7 1 911.375 NA NA
# Score the training data with the final model.
# BUGFIX: ifelse() cannot return NULL -- ifelse(FALSE, x, NULL) errors with
# "replacement has length zero" -- so a scalar if/else is required here.
glb_trnent_df <- glb_get_predictions(df=glb_trnent_df, mdl_id=glb_fin_mdl_id,
    rsp_var_out=glb_rsp_var_out,
    prob_threshold_def=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id, "opt.prob.threshold.OOB"] else
        NULL)
## Warning in glb_get_predictions(df = glb_trnent_df, mdl_id =
## glb_fin_mdl_id, : Using default probability threshold: 0.4
# Merge the final model's variable importances into the feature table and
# keep a per-model copy of the importance column for later comparison
glb_feats_df <- mymerge_feats_importance(feats_df=glb_feats_df, sel_mdl=glb_fin_mdl,
                                         entity_df=glb_trnent_df)
glb_feats_df[, paste0(glb_fin_mdl_id, ".importance")] <- glb_feats_df$importance
print(glb_feats_df)
## id importance cor.y
## WordCount.log WordCount.log 100.00000000 0.265952699
## SubsectionName.nb.fctr SubsectionName.nb.fctr 80.55962844 -0.213860009
## PubDate.hour PubDate.hour 50.40142882 0.159167673
## H.num.chars.log H.num.chars.log 31.50681999 -0.171062360
## PubDate.minute PubDate.minute 31.06935062 -0.031469083
## SectionName.nb.fctr SectionName.nb.fctr 30.96104916 -0.148701209
## S.num.chars.log S.num.chars.log 29.83955854 -0.224692967
## A.num.chars.log A.num.chars.log 27.42283005 -0.224548821
## NewsDesk.nb.fctr NewsDesk.nb.fctr 27.17840550 -0.172482671
## PubDate.second PubDate.second 24.91688135 -0.012253600
## H.num.words.log H.num.words.log 12.00527830 -0.200686356
## Headline.pfx.fctr Headline.pfx.fctr 11.72217010 -0.100052879
## S.num.words.unq.log S.num.words.unq.log 11.60835145 -0.250796919
## H.num.words.unq.log H.num.words.unq.log 11.25894100 -0.204496360
## A.num.words.unq.log A.num.words.unq.log 11.18544246 -0.250601203
## S.num.words.log S.num.words.log 10.97136866 -0.245354135
## A.num.words.log A.num.words.log 10.48234438 -0.245073324
## H.is.question H.is.question 7.54343001 0.129154799
## PubDate.wkday.fctr PubDate.wkday.fctr 5.06560307 -0.039801288
## PubDate.apm.fctr PubDate.apm.fctr 3.72443166 0.101472715
## PubDate.date.fctr PubDate.date.fctr 3.50248117 -0.011647558
## S.time S.time 2.28273783 -0.057595102
## A.time A.time 2.26463376 -0.057790617
## A.one A.one 2.23475110 0.005696039
## S.one S.one 2.19710541 0.006342094
## A.new A.new 2.12350133 -0.035359447
## S.new S.new 1.85916578 -0.034948520
## A.year A.year 1.73137113 -0.051146178
## S.can S.can 1.65012657 0.029999780
## S.year S.year 1.62647972 -0.051146178
## H.day H.day 1.60400212 -0.061669687
## A.can A.can 1.50097929 0.031498867
## S.will S.will 1.41958637 -0.060575493
## A.report A.report 1.38648928 -0.050211524
## A.will A.will 1.38263430 -0.061025004
## A.state A.state 1.29734430 0.005702163
## S.week S.week 1.29638460 -0.084814939
## S.report S.report 1.25986121 -0.050211524
## A.said A.said 1.21943045 0.001363226
## S.said S.said 1.20710524 0.001363226
## S.state S.state 1.20583031 0.006069626
## A.compani A.compani 1.19678908 -0.053099633
## S.newyork S.newyork 1.15146558 -0.062117105
## A.newyork A.newyork 1.14950629 -0.062117105
## A.week A.week 1.13858903 -0.084814939
## S.compani S.compani 1.12479942 -0.053012962
## A.take A.take 1.09050191 -0.026086108
## S.take S.take 1.00734157 -0.025762398
## A.make A.make 0.93790398 0.023138853
## H.has.ebola H.has.ebola 0.88048418 0.025881397
## S.make S.make 0.86990917 0.023138853
## S.share S.share 0.83850954 -0.050329686
## A.share A.share 0.83670420 -0.050329686
## S.presid S.presid 0.83648428 -0.019828826
## S.show S.show 0.79865434 -0.048801740
## A.presid A.presid 0.79431490 -0.019828826
## A.show A.show 0.78546924 -0.048801740
## S.day S.day 0.74597854 -0.045649185
## A.day A.day 0.72495657 -0.045909684
## H.new H.new 0.68617761 -0.053121542
## S.first S.first 0.49663910 -0.053388178
## A.first A.first 0.44866721 -0.053388178
## S.intern S.intern 0.44583700 -0.068485701
## A.intern A.intern 0.42046341 -0.068485701
## H.report H.report 0.41504183 -0.064948102
## H.today H.today 0.34780584 -0.063723058
## H.week H.week 0.34413627 -0.075105216
## H.X2014 H.X2014 0.29048530 -0.046206380
## H.newyork H.newyork 0.28817488 -0.057970095
## S.articl S.articl 0.19414432 -0.059520554
## A.articl A.articl 0.19190230 -0.059520554
## H.fashion H.fashion 0.13979364 -0.081708612
## A.fashion A.fashion 0.09319559 -0.086446251
## S.fashion S.fashion 0.08458414 -0.086446251
## .rnorm .rnorm NA -0.008703337
## A.has.http A.has.http NA -0.013592603
## A.num.chars A.num.chars NA -0.177037425
## A.num.words A.num.words NA -0.204211072
## A.num.words.unq A.num.words.unq NA -0.210242145
## H.daili H.daili NA -0.069192975
## H.has.http H.has.http NA NA
## H.num.chars H.num.chars NA -0.147211183
## H.num.words H.num.words NA -0.186036895
## H.num.words.unq H.num.words.unq NA -0.189702157
## H.X2015 H.X2015 NA -0.066584892
## Popular Popular NA 1.000000000
## Popular.fctr Popular.fctr NA NA
## PubDate.month.fctr PubDate.month.fctr NA 0.019148739
## PubDate.year PubDate.year NA NA
## S.has.http S.has.http NA NA
## S.num.chars S.num.chars NA -0.179331806
## S.num.words S.num.words NA -0.206385049
## S.num.words.unq S.num.words.unq NA -0.212102717
## UniqueID UniqueID NA 0.011824920
## WordCount WordCount NA 0.257526549
## exclude.as.feat cor.y.abs cor.high.X
## WordCount.log FALSE 0.265952699 <NA>
## SubsectionName.nb.fctr FALSE 0.213860009 NewsDesk.nb.fctr
## PubDate.hour FALSE 0.159167673 PubDate.apm.fctr
## H.num.chars.log FALSE 0.171062360 <NA>
## PubDate.minute FALSE 0.031469083 <NA>
## SectionName.nb.fctr FALSE 0.148701209 <NA>
## S.num.chars.log FALSE 0.224692967 A.num.chars.log
## A.num.chars.log FALSE 0.224548821 <NA>
## NewsDesk.nb.fctr FALSE 0.172482671 SectionName.nb.fctr
## PubDate.second FALSE 0.012253600 <NA>
## H.num.words.log FALSE 0.200686356 <NA>
## Headline.pfx.fctr FALSE 0.100052879 <NA>
## S.num.words.unq.log FALSE 0.250796919 S.num.chars.log
## H.num.words.unq.log FALSE 0.204496360 H.num.chars.log
## A.num.words.unq.log FALSE 0.250601203 <NA>
## S.num.words.log FALSE 0.245354135 A.num.words.log
## A.num.words.log FALSE 0.245073324 <NA>
## H.is.question FALSE 0.129154799 <NA>
## PubDate.wkday.fctr FALSE 0.039801288 <NA>
## PubDate.apm.fctr FALSE 0.101472715 <NA>
## PubDate.date.fctr FALSE 0.011647558 <NA>
## S.time FALSE 0.057595102 <NA>
## A.time FALSE 0.057790617 S.time
## A.one FALSE 0.005696039 <NA>
## S.one FALSE 0.006342094 <NA>
## A.new FALSE 0.035359447 S.new
## S.new FALSE 0.034948520 <NA>
## A.year FALSE 0.051146178 S.year
## S.can FALSE 0.029999780 <NA>
## S.year FALSE 0.051146178 <NA>
## H.day FALSE 0.061669687 <NA>
## A.can FALSE 0.031498867 S.can
## S.will FALSE 0.060575493 <NA>
## A.report FALSE 0.050211524 S.report
## A.will FALSE 0.061025004 S.will
## A.state FALSE 0.005702163 <NA>
## S.week FALSE 0.084814939 <NA>
## S.report FALSE 0.050211524 <NA>
## A.said FALSE 0.001363226 <NA>
## S.said FALSE 0.001363226 <NA>
## S.state FALSE 0.006069626 <NA>
## A.compani FALSE 0.053099633 S.compani
## S.newyork FALSE 0.062117105 <NA>
## A.newyork FALSE 0.062117105 S.newyork
## A.week FALSE 0.084814939 S.week
## S.compani FALSE 0.053012962 <NA>
## A.take FALSE 0.026086108 S.take
## S.take FALSE 0.025762398 <NA>
## A.make FALSE 0.023138853 S.make
## H.has.ebola FALSE 0.025881397 <NA>
## S.make FALSE 0.023138853 <NA>
## S.share FALSE 0.050329686 <NA>
## A.share FALSE 0.050329686 S.share
## S.presid FALSE 0.019828826 <NA>
## S.show FALSE 0.048801740 <NA>
## A.presid FALSE 0.019828826 S.presid
## A.show FALSE 0.048801740 S.show
## S.day FALSE 0.045649185 <NA>
## A.day FALSE 0.045909684 S.day
## H.new FALSE 0.053121542 <NA>
## S.first FALSE 0.053388178 <NA>
## A.first FALSE 0.053388178 S.first
## S.intern FALSE 0.068485701 <NA>
## A.intern FALSE 0.068485701 S.intern
## H.report FALSE 0.064948102 <NA>
## H.today FALSE 0.063723058 <NA>
## H.week FALSE 0.075105216 <NA>
## H.X2014 FALSE 0.046206380 <NA>
## H.newyork FALSE 0.057970095 <NA>
## S.articl FALSE 0.059520554 <NA>
## A.articl FALSE 0.059520554 S.articl
## H.fashion FALSE 0.081708612 H.week
## A.fashion FALSE 0.086446251 S.fashion
## S.fashion FALSE 0.086446251 <NA>
## .rnorm FALSE 0.008703337 <NA>
## A.has.http FALSE 0.013592603 <NA>
## A.num.chars TRUE 0.177037425 <NA>
## A.num.words TRUE 0.204211072 <NA>
## A.num.words.unq TRUE 0.210242145 <NA>
## H.daili FALSE 0.069192975 <NA>
## H.has.http FALSE NA <NA>
## H.num.chars TRUE 0.147211183 <NA>
## H.num.words TRUE 0.186036895 <NA>
## H.num.words.unq TRUE 0.189702157 <NA>
## H.X2015 FALSE 0.066584892 <NA>
## Popular TRUE 1.000000000 <NA>
## Popular.fctr TRUE NA <NA>
## PubDate.month.fctr TRUE 0.019148739 <NA>
## PubDate.year TRUE NA <NA>
## S.has.http FALSE NA <NA>
## S.num.chars TRUE 0.179331806 <NA>
## S.num.words TRUE 0.206385049 <NA>
## S.num.words.unq TRUE 0.212102717 <NA>
## UniqueID TRUE 0.011824920 <NA>
## WordCount TRUE 0.257526549 <NA>
## is.ConditionalX.y is.cor.y.abs.low rsp_var_raw
## WordCount.log TRUE FALSE FALSE
## SubsectionName.nb.fctr TRUE FALSE FALSE
## PubDate.hour TRUE FALSE FALSE
## H.num.chars.log TRUE FALSE FALSE
## PubDate.minute TRUE FALSE FALSE
## SectionName.nb.fctr TRUE FALSE FALSE
## S.num.chars.log TRUE FALSE FALSE
## A.num.chars.log TRUE FALSE FALSE
## NewsDesk.nb.fctr TRUE FALSE FALSE
## PubDate.second TRUE FALSE FALSE
## H.num.words.log TRUE FALSE FALSE
## Headline.pfx.fctr TRUE FALSE FALSE
## S.num.words.unq.log TRUE FALSE FALSE
## H.num.words.unq.log TRUE FALSE FALSE
## A.num.words.unq.log TRUE FALSE FALSE
## S.num.words.log TRUE FALSE FALSE
## A.num.words.log TRUE FALSE FALSE
## H.is.question TRUE FALSE FALSE
## PubDate.wkday.fctr TRUE FALSE FALSE
## PubDate.apm.fctr TRUE FALSE FALSE
## PubDate.date.fctr TRUE FALSE FALSE
## S.time TRUE FALSE FALSE
## A.time TRUE FALSE FALSE
## A.one TRUE TRUE FALSE
## S.one TRUE TRUE FALSE
## A.new TRUE FALSE FALSE
## S.new TRUE FALSE FALSE
## A.year TRUE FALSE FALSE
## S.can TRUE FALSE FALSE
## S.year TRUE FALSE FALSE
## H.day TRUE FALSE FALSE
## A.can TRUE FALSE FALSE
## S.will TRUE FALSE FALSE
## A.report TRUE FALSE FALSE
## A.will TRUE FALSE FALSE
## A.state TRUE TRUE FALSE
## S.week TRUE FALSE FALSE
## S.report TRUE FALSE FALSE
## A.said TRUE TRUE FALSE
## S.said TRUE TRUE FALSE
## S.state TRUE TRUE FALSE
## A.compani TRUE FALSE FALSE
## S.newyork TRUE FALSE FALSE
## A.newyork TRUE FALSE FALSE
## A.week TRUE FALSE FALSE
## S.compani TRUE FALSE FALSE
## A.take TRUE FALSE FALSE
## S.take TRUE FALSE FALSE
## A.make TRUE FALSE FALSE
## H.has.ebola TRUE FALSE FALSE
## S.make TRUE FALSE FALSE
## S.share TRUE FALSE FALSE
## A.share TRUE FALSE FALSE
## S.presid TRUE FALSE FALSE
## S.show TRUE FALSE FALSE
## A.presid TRUE FALSE FALSE
## A.show TRUE FALSE FALSE
## S.day TRUE FALSE FALSE
## A.day TRUE FALSE FALSE
## H.new TRUE FALSE FALSE
## S.first TRUE FALSE FALSE
## A.first TRUE FALSE FALSE
## S.intern TRUE FALSE FALSE
## A.intern TRUE FALSE FALSE
## H.report TRUE FALSE FALSE
## H.today TRUE FALSE FALSE
## H.week TRUE FALSE FALSE
## H.X2014 TRUE FALSE FALSE
## H.newyork TRUE FALSE FALSE
## S.articl TRUE FALSE FALSE
## A.articl TRUE FALSE FALSE
## H.fashion TRUE FALSE FALSE
## A.fashion TRUE FALSE FALSE
## S.fashion TRUE FALSE FALSE
## .rnorm TRUE FALSE FALSE
## A.has.http FALSE FALSE FALSE
## A.num.chars NA FALSE FALSE
## A.num.words NA FALSE FALSE
## A.num.words.unq NA FALSE FALSE
## H.daili FALSE FALSE FALSE
## H.has.http FALSE NA FALSE
## H.num.chars NA FALSE FALSE
## H.num.words NA FALSE FALSE
## H.num.words.unq NA FALSE FALSE
## H.X2015 FALSE FALSE FALSE
## Popular NA FALSE TRUE
## Popular.fctr NA NA NA
## PubDate.month.fctr NA FALSE FALSE
## PubDate.year NA NA FALSE
## S.has.http FALSE NA FALSE
## S.num.chars NA FALSE FALSE
## S.num.words NA FALSE FALSE
## S.num.words.unq NA FALSE FALSE
## UniqueID NA FALSE FALSE
## WordCount NA FALSE FALSE
## id_var rsp_var Conditional.X.no.rnorm.rf.importance
## WordCount.log NA NA 100.00000000
## SubsectionName.nb.fctr NA NA 80.55962844
## PubDate.hour NA NA 50.40142882
## H.num.chars.log NA NA 31.50681999
## PubDate.minute NA NA 31.06935062
## SectionName.nb.fctr NA NA 30.96104916
## S.num.chars.log NA NA 29.83955854
## A.num.chars.log NA NA 27.42283005
## NewsDesk.nb.fctr NA NA 27.17840550
## PubDate.second NA NA 24.91688135
## H.num.words.log NA NA 12.00527830
## Headline.pfx.fctr NA NA 11.72217010
## S.num.words.unq.log NA NA 11.60835145
## H.num.words.unq.log NA NA 11.25894100
## A.num.words.unq.log NA NA 11.18544246
## S.num.words.log NA NA 10.97136866
## A.num.words.log NA NA 10.48234438
## H.is.question NA NA 7.54343001
## PubDate.wkday.fctr NA NA 5.06560307
## PubDate.apm.fctr NA NA 3.72443166
## PubDate.date.fctr NA NA 3.50248117
## S.time NA NA 2.28273783
## A.time NA NA 2.26463376
## A.one NA NA 2.23475110
## S.one NA NA 2.19710541
## A.new NA NA 2.12350133
## S.new NA NA 1.85916578
## A.year NA NA 1.73137113
## S.can NA NA 1.65012657
## S.year NA NA 1.62647972
## H.day NA NA 1.60400212
## A.can NA NA 1.50097929
## S.will NA NA 1.41958637
## A.report NA NA 1.38648928
## A.will NA NA 1.38263430
## A.state NA NA 1.29734430
## S.week NA NA 1.29638460
## S.report NA NA 1.25986121
## A.said NA NA 1.21943045
## S.said NA NA 1.20710524
## S.state NA NA 1.20583031
## A.compani NA NA 1.19678908
## S.newyork NA NA 1.15146558
## A.newyork NA NA 1.14950629
## A.week NA NA 1.13858903
## S.compani NA NA 1.12479942
## A.take NA NA 1.09050191
## S.take NA NA 1.00734157
## A.make NA NA 0.93790398
## H.has.ebola NA NA 0.88048418
## S.make NA NA 0.86990917
## S.share NA NA 0.83850954
## A.share NA NA 0.83670420
## S.presid NA NA 0.83648428
## S.show NA NA 0.79865434
## A.presid NA NA 0.79431490
## A.show NA NA 0.78546924
## S.day NA NA 0.74597854
## A.day NA NA 0.72495657
## H.new NA NA 0.68617761
## S.first NA NA 0.49663910
## A.first NA NA 0.44866721
## S.intern NA NA 0.44583700
## A.intern NA NA 0.42046341
## H.report NA NA 0.41504183
## H.today NA NA 0.34780584
## H.week NA NA 0.34413627
## H.X2014 NA NA 0.29048530
## H.newyork NA NA 0.28817488
## S.articl NA NA 0.19414432
## A.articl NA NA 0.19190230
## H.fashion NA NA 0.13979364
## A.fashion NA NA 0.09319559
## S.fashion NA NA 0.08458414
## .rnorm NA NA NA
## A.has.http NA NA NA
## A.num.chars NA NA NA
## A.num.words NA NA NA
## A.num.words.unq NA NA NA
## H.daili NA NA NA
## H.has.http NA NA NA
## H.num.chars NA NA NA
## H.num.words NA NA NA
## H.num.words.unq NA NA NA
## H.X2015 NA NA NA
## Popular NA NA NA
## Popular.fctr NA TRUE NA
## PubDate.month.fctr NA NA NA
## PubDate.year NA NA NA
## S.has.http NA NA NA
## S.num.chars NA NA NA
## S.num.words NA NA NA
## S.num.words.unq NA NA NA
## UniqueID TRUE NA NA
## WordCount NA NA NA
## Final.rf.importance
## WordCount.log 100.00000000
## SubsectionName.nb.fctr 80.55962844
## PubDate.hour 50.40142882
## H.num.chars.log 31.50681999
## PubDate.minute 31.06935062
## SectionName.nb.fctr 30.96104916
## S.num.chars.log 29.83955854
## A.num.chars.log 27.42283005
## NewsDesk.nb.fctr 27.17840550
## PubDate.second 24.91688135
## H.num.words.log 12.00527830
## Headline.pfx.fctr 11.72217010
## S.num.words.unq.log 11.60835145
## H.num.words.unq.log 11.25894100
## A.num.words.unq.log 11.18544246
## S.num.words.log 10.97136866
## A.num.words.log 10.48234438
## H.is.question 7.54343001
## PubDate.wkday.fctr 5.06560307
## PubDate.apm.fctr 3.72443166
## PubDate.date.fctr 3.50248117
## S.time 2.28273783
## A.time 2.26463376
## A.one 2.23475110
## S.one 2.19710541
## A.new 2.12350133
## S.new 1.85916578
## A.year 1.73137113
## S.can 1.65012657
## S.year 1.62647972
## H.day 1.60400212
## A.can 1.50097929
## S.will 1.41958637
## A.report 1.38648928
## A.will 1.38263430
## A.state 1.29734430
## S.week 1.29638460
## S.report 1.25986121
## A.said 1.21943045
## S.said 1.20710524
## S.state 1.20583031
## A.compani 1.19678908
## S.newyork 1.15146558
## A.newyork 1.14950629
## A.week 1.13858903
## S.compani 1.12479942
## A.take 1.09050191
## S.take 1.00734157
## A.make 0.93790398
## H.has.ebola 0.88048418
## S.make 0.86990917
## S.share 0.83850954
## A.share 0.83670420
## S.presid 0.83648428
## S.show 0.79865434
## A.presid 0.79431490
## A.show 0.78546924
## S.day 0.74597854
## A.day 0.72495657
## H.new 0.68617761
## S.first 0.49663910
## A.first 0.44866721
## S.intern 0.44583700
## A.intern 0.42046341
## H.report 0.41504183
## H.today 0.34780584
## H.week 0.34413627
## H.X2014 0.29048530
## H.newyork 0.28817488
## S.articl 0.19414432
## A.articl 0.19190230
## H.fashion 0.13979364
## A.fashion 0.09319559
## S.fashion 0.08458414
## .rnorm NA
## A.has.http NA
## A.num.chars NA
## A.num.words NA
## A.num.words.unq NA
## H.daili NA
## H.has.http NA
## H.num.chars NA
## H.num.words NA
## H.num.words.unq NA
## H.X2015 NA
## Popular NA
## Popular.fctr NA
## PubDate.month.fctr NA
## PubDate.year NA
## S.has.http NA
## S.num.chars NA
## S.num.words NA
## S.num.words.unq NA
## UniqueID NA
## WordCount NA
# Diagnostic plots of the final model on the training data.
# BUGFIX: ifelse() cannot return NULL -- ifelse(FALSE, x, NULL) errors with
# "replacement has length zero" -- so a scalar if/else is required here.
glb_analytics_diag_plots(obs_df=glb_trnent_df, mdl_id=glb_fin_mdl_id,
    prob_threshold=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else
        NULL)
## Warning in glb_analytics_diag_plots(obs_df = glb_trnent_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 74
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## 1507 1507 N 0.000
## 6370 6370 Y 0.744
## Popular.fctr.predict.Final.rf Popular.fctr.predict.Final.rf.accurate
## 1507 N TRUE
## 6370 Y TRUE
## Popular.fctr.predict.Final.rf.error .label
## 1507 0 1507
## 6370 0 6370
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## 2606 2606 N 0.406
## Popular.fctr.predict.Final.rf Popular.fctr.predict.Final.rf.accurate
## 2606 Y FALSE
## Popular.fctr.predict.Final.rf.error
## 2606 0.006
# Collect the ids of every feature that received an importance score from
# any model (i.e. is non-NA in some "*.importance" column)
dsp_feats_vctr <- NULL
importance_cols <- grep(".importance", names(glb_feats_df), fixed=TRUE, value=TRUE)
for (imp_col in importance_cols) {
    scored_ids <- glb_feats_df[!is.na(glb_feats_df[, imp_col]), "id"]
    dsp_feats_vctr <- union(dsp_feats_vctr, scored_ids)
}
# Re-inspect the former OOB false negatives under the final model's predictions
print(glb_trnent_df[glb_trnent_df$UniqueID %in% FN_OOB_ids,
                    grep(glb_rsp_var, names(glb_trnent_df), value=TRUE)])
## Popular.fctr Popular.fctr.predict.Final.rf.prob
## 92 Y 0.638
## 693 Y 0.630
## 4020 Y 0.666
## 4721 Y 0.660
## Popular.fctr.predict.Final.rf
## 92 Y
## 693 Y
## 4020 Y
## 4721 Y
# Snapshot glb_entity_df before folding the final-model prediction columns
# back into it from the split data frames.
sav_entity_df <- glb_entity_df
print(setdiff(names(glb_trnent_df), names(glb_entity_df)))
## [1] "Popular.fctr.predict.Final.rf.prob"
## [2] "Popular.fctr.predict.Final.rf"
# Copy each prediction column present only in glb_trnent_df into the training
# rows of glb_entity_df.  NOTE(review): this assumes the rows of glb_trnent_df
# are aligned with the .src == "Train" rows of glb_entity_df (same order and
# count) — TODO confirm; a keyed merge would be safer, as the author notes.
for (col in setdiff(names(glb_trnent_df), names(glb_entity_df)))
# Merge or cbind ?
glb_entity_df[glb_entity_df$.src == "Train", col] <- glb_trnent_df[, col]
print(setdiff(names(glb_fitent_df), names(glb_entity_df)))
## character(0)
print(setdiff(names(glb_OOBent_df), names(glb_entity_df)))
## character(0)
# Same positional copy for any columns unique to the OOB split (none left
# after the training copy above, per the printed character(0)).
for (col in setdiff(names(glb_OOBent_df), names(glb_entity_df)))
# Merge or cbind ?
glb_entity_df[glb_entity_df$.lcn == "OOB", col] <- glb_OOBent_df[, col]
print(setdiff(names(glb_newent_df), names(glb_entity_df)))
## character(0)
# Persist the consolidated analysis objects; the split data frames are
# deliberately excluded (commented out) since glb_entity_df now carries the
# prediction columns.
save(glb_feats_df, glb_entity_df,
#glb_trnent_df, glb_fitent_df, glb_OOBent_df, glb_newent_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
glb_sel_mdl, glb_sel_mdl_id,
glb_fin_mdl, glb_fin_mdl_id,
file=paste0(glb_out_pfx, "dsk.RData"))
# Replay the analytics Petri-net with the two newly available objects.
# NOTE: the assignment to glb_analytics_avl_objs happens *inside* the argument
# expression — it appends "data.training.all.prediction" and "model.final" to
# the global before the simulation runs.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord=TRUE)
## time	   trans	 "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 	multiple enabled transitions:  data.training.all data.new model.selected	firing:  data.training.all
## 1.0000 	 1 	 2 	 1 	 0 	 0
## 1.0000 	multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction	firing:  data.new
## 2.0000 	 2 	 1 	 1 	 1 	 0
## 2.0000 	multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction	firing:  model.selected
## 3.0000 	 3 	 0 	 2 	 1 	 0
## 3.0000 	multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction	firing:  data.training.all.prediction
## 4.0000 	 5 	 0 	 1 	 1 	 1
## 4.0000 	multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction	firing:  model.final
## 5.0000 	 4 	 0 	 0 	 2 	 1
# Open the next major step ("predict.data.new") in the elapsed-time log.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 14 fit.data.training 7 1 911.375 1000.29 88.915
## 15 predict.data.new 8 0 1000.291 NA NA
## 8.0: predict data new
# Compute final model predictions
# Score the new (test) data with the final model and draw the diagnostic plots.
# Fix: the original used ifelse(cond, <threshold>, NULL).  ifelse() is meant
# for vectorized conditions; with no=NULL it errors ("replacement has length
# zero") whenever cond is FALSE, so the NULL branch was unreachable.  A plain
# if/else returns the threshold when binomial classification (as here) and
# NULL otherwise, which is what the callees expect.
glb_newent_df <- glb_get_predictions(glb_newent_df, mdl_id=glb_fin_mdl_id,
    rsp_var_out=glb_rsp_var_out,
    prob_threshold_def=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else NULL)
## Warning in glb_get_predictions(glb_newent_df, mdl_id = glb_fin_mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
glb_analytics_diag_plots(obs_df=glb_newent_df, mdl_id=glb_fin_mdl_id,
    prob_threshold=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else NULL)
## Warning in glb_analytics_diag_plots(obs_df = glb_newent_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 74
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## 6753 6753 <NA> 0.530
## 7309 7309 <NA> 0.062
## Popular.fctr.predict.Final.rf Popular.fctr.predict.Final.rf.accurate
## 6753 Y NA
## 7309 N NA
## Popular.fctr.predict.Final.rf.error .label
## 6753 0 6753
## 7309 0 7309
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## NA NA <NA> NA
## NA.1 NA <NA> NA
## NA.2 NA <NA> NA
## NA.3 NA <NA> NA
## NA.4 NA <NA> NA
## NA.5 NA <NA> NA
## Popular.fctr.predict.Final.rf Popular.fctr.predict.Final.rf.accurate
## NA <NA> NA
## NA.1 <NA> NA
## NA.2 <NA> NA
## NA.3 <NA> NA
## NA.4 <NA> NA
## NA.5 <NA> NA
## Popular.fctr.predict.Final.rf.error
## NA NA
## NA.1 NA
## NA.2 NA
## NA.3 NA
## NA.4 NA
## NA.5 NA
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## NA.102 NA <NA> NA
## NA.534 NA <NA> NA
## NA.766 NA <NA> NA
## NA.783 NA <NA> NA
## NA.850 NA <NA> NA
## NA.1829 NA <NA> NA
## Popular.fctr.predict.Final.rf
## NA.102 <NA>
## NA.534 <NA>
## NA.766 <NA>
## NA.783 <NA>
## NA.850 <NA>
## NA.1829 <NA>
## Popular.fctr.predict.Final.rf.accurate
## NA.102 NA
## NA.534 NA
## NA.766 NA
## NA.783 NA
## NA.850 NA
## NA.1829 NA
## Popular.fctr.predict.Final.rf.error
## NA.102 NA
## NA.534 NA
## NA.766 NA
## NA.783 NA
## NA.850 NA
## NA.1829 NA
## UniqueID Popular.fctr Popular.fctr.predict.Final.rf.prob
## NA.1864 NA <NA> NA
## NA.1865 NA <NA> NA
## NA.1866 NA <NA> NA
## NA.1867 NA <NA> NA
## NA.1868 NA <NA> NA
## NA.1869 NA <NA> NA
## Popular.fctr.predict.Final.rf
## NA.1864 <NA>
## NA.1865 <NA>
## NA.1866 <NA>
## NA.1867 <NA>
## NA.1868 <NA>
## NA.1869 <NA>
## Popular.fctr.predict.Final.rf.accurate
## NA.1864 NA
## NA.1865 NA
## NA.1866 NA
## NA.1867 NA
## NA.1868 NA
## NA.1869 NA
## Popular.fctr.predict.Final.rf.error
## NA.1864 NA
## NA.1865 NA
## NA.1866 NA
## NA.1867 NA
## NA.1868 NA
## NA.1869 NA
## Warning: Removed 1870 rows containing missing values (geom_point).
# Build the Kaggle submission: id column(s) plus the final model's predicted
# probability for the positive class.
submit_df <- glb_newent_df[, c(glb_id_vars,
paste0(glb_rsp_var_out, glb_fin_mdl_id, ".prob"))]
# NOTE(review): hard-codes the probability column at position 2, i.e. assumes
# glb_id_vars has length 1 (it is "UniqueID" in this run) — TODO confirm.
names(submit_df)[2] <- "Probability1"
# Literal dots in the prefix/model id are replaced with underscores to form
# the output file name (gsub with fixed=TRUE, no regex).
write.csv(submit_df,
paste0(gsub(".", "_", paste0(glb_out_pfx, glb_fin_mdl_id), fixed=TRUE),
"_submit.csv"), row.names=FALSE)
# print(orderBy(~ -max.auc.OOB, glb_models_df[, c("model_id",
# "max.auc.OOB", "max.Accuracy.OOB")]))
# Report the selected model's OOB-optimal probability threshold and the ids
# of the selected vs. final models, plus the fit-set dimensions.
print(glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"])
## [1] 0.4
print(sprintf("glb_sel_mdl_id: %s", glb_sel_mdl_id))
## [1] "glb_sel_mdl_id: Conditional.X.no.rnorm.rf"
print(sprintf("glb_fin_mdl_id: %s", glb_fin_mdl_id))
## [1] "glb_fin_mdl_id: Final.rf"
print(dim(glb_fitent_df))
## [1] 4475  107
print(dsp_models_df)
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 11 Conditional.X.no.rnorm.rf 0.9081186 0.9314323 0.6757737
## 9 Conditional.X.glm 0.9027710 0.8173314 0.6451546
## 7 Interact.High.cor.Y.glm 0.8954789 0.9127805 0.6286271
## 8 Low.cor.X.glm 0.8794361 0.9144538 0.5944691
## 10 Conditional.X.no.rnorm.rpart 0.8740885 0.6862731 0.4517912
## 1 MFO.myMFO_classfr 0.8327662 0.5000000 0.0000000
## 3 Max.cor.Y.cv.0.rpart 0.8327662 0.5000000 0.0000000
## 5 Max.cor.Y.rpart 0.8327662 0.5000000 0.0000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.7715119 0.6504263 0.2413609
## 6 Max.cor.Y.glm 0.6932426 0.7342331 0.2215049
## 2 Random.myrandom_classfr 0.1672338 0.4821958 0.0000000
## min.aic.fit opt.prob.threshold.OOB
## 11 NA 0.4
## 9 29927.970 0.9
## 7 2504.643 0.3
## 8 2443.257 0.3
## 10 NA 0.7
## 1 NA 0.5
## 3 NA 0.5
## 5 NA 0.5
## 4 NA 0.2
## 6 3674.923 0.2
## 2 NA 0.1
# Overall OOB confusion matrix for the selected model (transposed so rows are
# the reference class, columns the prediction).
print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id))
## [1] "Conditional.X.no.rnorm.rf OOB confusion matrix & accuracy: "
print(t(confusionMatrix(glb_OOBent_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
glb_OOBent_df[, glb_rsp_var])$table))
##          Prediction
## Reference    N    Y
##         N 1611  102
##         Y   87  257
# nOOB_ctgry_df <- mycreate_sqlxtab_df(glb_OOBent_df, c("NewsDesk.nb"))
# Per-NewsDesk.nb accuracy: cross-tab accurate vs. inaccurate counts, then
# derive counts and accuracy per category.
tmp_OOBent_df <- glb_OOBent_df[, c("NewsDesk.nb", predct_accurate_var_name)]
names(tmp_OOBent_df)[2] <- "accurate.OOB"
aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBent_df, names(tmp_OOBent_df))
# Categories with no rows in one accuracy bucket come back NA; treat as zero.
aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0
aOOB_ctgry_df <- mutate(aOOB_ctgry_df,
.n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB)
# The shared columns below are the keys merge() will join on
# (auto-printed by knitr at top level).
intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
## [1] "NewsDesk.nb" ".n.OOB"
# Outer-join the OOB accuracy columns into the global category table, then
# list categories by descending error count.
glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all=TRUE)
print(orderBy(~-accurate.OOB.FALSE, glb_ctgry_df))
## NewsDesk.nb .n.OOB .n.Tst .freqRatio.Tst .freqRatio.OOB
## 1 Business 516 500 0.267379679 0.2508507535
## 11 OpEd 217 205 0.109625668 0.1054934370
## 9 myMisc:: 297 247 0.132085561 0.1443850267
## 16 Styles 75 76 0.040641711 0.0364608653
## 14 Science 66 57 0.030481283 0.0320855615
## 2 Culture 203 243 0.129946524 0.0986874088
## 7 Metro 57 66 0.035294118 0.0277102577
## 5 Foreign 121 107 0.057219251 0.0588235294
## 12 Readers Respond:: 2 4 0.002139037 0.0009722897
## 20 TStyle 239 107 0.057219251 0.1161886242
## 8 myEducation 118 93 0.049732620 0.0573650948
## 10 myMultimedia 38 53 0.028342246 0.0184735051
## 3 Daily Clip Report:: 18 22 0.011764706 0.0087506077
## 4 First Draft:: 18 14 0.007486631 0.0087506077
## 6 Magazine 10 3 0.001604278 0.0048614487
## 13 Reporter's Notebook:: 1 7 0.003743316 0.0004861449
## 15 Sports 1 NA NA 0.0004861449
## 17 The Daily Gift:: 1 2 0.001069519 0.0004861449
## 18 Today in Politics:: 14 21 0.011229947 0.0068060282
## 19 Travel 34 31 0.016577540 0.0165289256
## 21 Verbatim:: 11 12 0.006417112 0.0053475936
## accurate.OOB.FALSE accurate.OOB.TRUE max.accuracy.OOB
## 1 56 460 0.8914729
## 11 37 180 0.8294931
## 9 35 262 0.8821549
## 16 21 54 0.7200000
## 14 18 48 0.7272727
## 2 9 194 0.9556650
## 7 5 52 0.9122807
## 5 2 119 0.9834711
## 12 2 0 0.0000000
## 20 2 237 0.9916318
## 8 1 117 0.9915254
## 10 1 37 0.9736842
## 3 0 18 1.0000000
## 4 0 18 1.0000000
## 6 0 10 1.0000000
## 13 0 1 1.0000000
## 15 0 1 1.0000000
## 17 0 1 1.0000000
## 18 0 14 1.0000000
## 19 0 34 1.0000000
## 21 0 11 1.0000000
# OOB_ctgry_df <- mycreate_sqlxtab_df(glb_OOBent_df, c("NewsDesk.nb"))
# print(sum(glb_OOBent_df[, predct_accurate_var_name]) / nrow(glb_OOBent_df))
# print(tapply(glb_OOBent_df[, predct_accurate_var_name],
# glb_OOBent_df[, "NewsDesk.nb"], sum) /
# tapply(glb_OOBent_df[, predct_accurate_var_name],
# glb_OOBent_df[, "NewsDesk.nb"], length))
# acc_df <- as.data.frame(
# tapply(glb_OOBent_df[, predct_accurate_var_name],
# glb_OOBent_df[, "NewsDesk.nb"], sum) /
# tapply(glb_OOBent_df[, predct_accurate_var_name],
# glb_OOBent_df[, "NewsDesk.nb"], length))
# names(acc_df) <- c("max.accuracy.OOB")
# print(orderBy(~-max.accuracy.OOB, acc_df))
# Print the OOB confusion matrix, accuracy, and misclassified headlines for a
# single NewsDesk.nb category.  Side effects only (printing); reads the
# globals glb_OOBent_df, glb_sel_mdl_id, glb_rsp_var_out, glb_rsp_var,
# glb_id_vars and predct_accurate_var_name.
dsp_NewsDesk.nb_conf_mtrx <- function(NewsDesk.nb) {
    # Subset the OOB rows for this category once instead of re-filtering in
    # every expression.
    ctgry_df <- glb_OOBent_df[glb_OOBent_df$NewsDesk.nb == NewsDesk.nb, ]

    print(sprintf("%s OOB::NewsDesk.nb=%s confusion matrix & accuracy: ",
                  glb_sel_mdl_id, NewsDesk.nb))
    # Transposed so rows show the reference class, columns the prediction.
    print(t(confusionMatrix(
        ctgry_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
        ctgry_df[, glb_rsp_var])$table))
    print(sum(ctgry_df[, predct_accurate_var_name]) / nrow(ctgry_df))

    # Ids of the misclassified observations in this category (expression kept
    # in the original combined form to preserve its NA handling).
    err_ids <- glb_OOBent_df[(glb_OOBent_df$NewsDesk.nb == NewsDesk.nb) &
        (!glb_OOBent_df[, predct_accurate_var_name]), glb_id_vars]
    print(sprintf("%s OOB::NewsDesk.nb=%s errors: ", glb_sel_mdl_id, NewsDesk.nb))
    print(glb_OOBent_df[glb_OOBent_df$UniqueID %in% err_ids,
                        c("Headline.pfx", "Headline", "Popular")])
}
# Per-category diagnostics; earlier explorations left commented out.
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="Culture")
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="Foreign")
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="Metro")
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="Science")
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="Styles")
#dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="TStyle")
dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="myEducation")
## [1] "Conditional.X.no.rnorm.rf OOB::NewsDesk.nb=myEducation confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 117 1
## Y 0 0
## [1] 0.9915254
## [1] "Conditional.X.no.rnorm.rf OOB::NewsDesk.nb=myEducation errors: "
## Headline.pfx Headline Popular
## 5525 myMisc:: 500 Prompts for Narrative and Personal Writing 0
dsp_NewsDesk.nb_conf_mtrx(NewsDesk.nb="myMultimedia")
## [1] "Conditional.X.no.rnorm.rf OOB::NewsDesk.nb=myMultimedia confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 37 0
## Y 1 0
## [1] 0.9736842
## [1] "Conditional.X.no.rnorm.rf OOB::NewsDesk.nb=myMultimedia errors: "
## Headline.pfx Headline Popular
## 4012 myMisc:: A Limited View of Boys From the Bronx 1
# Count and list the popular (Popular == 1) observations in the myMisc::
# headline-prefix bucket; sel_obs/dsp_obs are helpers defined earlier in the
# file.
print(nrow(glb_entity_df[sel_obs(Popular=1, NewsDesk.nb="myMisc::"),]))
## [1] 107
dsp_obs(Popular=1, NewsDesk.nb="myMisc::", all=TRUE)
## UniqueID Popular
## 56 56 1
## 95 95 1
## 130 130 1
## 163 163 1
## 291 291 1
## 317 317 1
## 364 364 1
## 436 436 1
## 444 444 1
## 493 493 1
## 629 629 1
## 685 685 1
## 714 714 1
## 779 779 1
## 809 809 1
## 882 882 1
## 983 983 1
## 1021 1021 1
## 1092 1092 1
## 1132 1132 1
## 1156 1156 1
## 1193 1193 1
## 1312 1312 1
## 1404 1404 1
## 1489 1489 1
## 1526 1526 1
## 1560 1560 1
## 1596 1596 1
## 1702 1702 1
## 1767 1767 1
## 1807 1807 1
## 1871 1871 1
## 2007 2007 1
## 2088 2088 1
## 2123 2123 1
## 2165 2165 1
## 2167 2167 1
## 2196 2196 1
## 2228 2228 1
## 2267 2267 1
## 2319 2319 1
## 2330 2330 1
## 2438 2438 1
## 2512 2512 1
## 2542 2542 1
## 2603 2603 1
## 2725 2725 1
## 2756 2756 1
## 2852 2852 1
## 2873 2873 1
## 2883 2883 1
## 3044 3044 1
## 3074 3074 1
## 3168 3168 1
## 3231 3231 1
## 3235 3235 1
## 3263 3263 1
## 3287 3287 1
## 3368 3368 1
## 3414 3414 1
## 3444 3444 1
## 3472 3472 1
## 3492 3492 1
## 3521 3521 1
## 3574 3574 1
## 3635 3635 1
## 3809 3809 1
## 3851 3851 1
## 3882 3882 1
## 3901 3901 1
## 3908 3908 1
## 4173 4173 1
## 4206 4206 1
## 4265 4265 1
## 4281 4281 1
## 4359 4359 1
## 4398 4398 1
## 4415 4415 1
## 4465 4465 1
## 4540 4540 1
## 4712 4712 1
## 4833 4833 1
## 5038 5038 1
## 5058 5058 1
## 5276 5276 1
## 5285 5285 1
## 5345 5345 1
## 5387 5387 1
## 5649 5649 1
## 5674 5674 1
## 5881 5881 1
## 5884 5884 1
## 5923 5923 1
## 6057 6057 1
## 6159 6159 1
## 6221 6221 1
## 6251 6251 1
## 6252 6252 1
## 6283 6283 1
## 6286 6286 1
## 6327 6327 1
## 6357 6357 1
## 6370 6370 1
## 6395 6395 1
## 6416 6416 1
## 6484 6484 1
## 6497 6497 1
## Headline
## 56 How Can Men Help Prevent Sexual Assault?
## 95 Phrases We Love Too Much
## 130 How a Revelation About Hello Kittys Identity Blew Everyones Mind
## 163 Why Leaked Nude Photos Are Another Frontier for Feminists
## 291 A Debate Fueled by Carbs (or the Lack Thereof)
## 317 What Does the Middle Class Need?
## 364 Joan Rivers and the Front Page
## 436 Should an American Man Wear Shorts?
## 444 A College Education Should Include Rooming With a Stranger
## 493 When Family Dinner Doesnt Satisfy
## 629 When Spell-Check Can’t Help
## 685 What Janay Rice Wants
## 714 Time to Ban Middle School?
## 779 Are Biblical Epics Epically Racist?
## 809 What Does It Mean to Be Scottish?
## 882 How Women Talk About Clothes
## 983 How Keeping a Diary Can Surprise You
## 1021 Are Videos the Future of College Admissions?
## 1092 How to Fake Your Next Vacation
## 1132 Wasted Words
## 1156 Tracking a Microtrend Among Affluent Parents
## 1193 What Blade Runner Got Wrong
## 1312 Boycott the N.F.L.?
## 1404 When Tips Are Not Enough
## 1489 What to Expect From Narendra Modi at the United Nations
## 1526 Why Are We So Obsessed With Gilmore Girls?
## 1560 What Is the Right Way to Travel?
## 1596 Should We Continue to Prosecute Nazi War Criminals?
## 1702 Do the Math
## 1767 Why Asexuals Dont Want to Be Invisible Anymore
## 1807 How Fear of Death Could Make You Splurge
## 1871 A Foreign Policy Turning Point or a Moral-Equivalence Blunder?
## 2007 Can a Social Network Stay Ad-Free?
## 2088 Narendra Modi, in U.N. Speech, Inserts India Into Terrorism Fight
## 2123 Dont Do the Things You Love
## 2165 California Is Burning
## 2167 Netanyahu Links Hamas With ISIS, and Equates ISIS With Iran
## 2196 Beware of Joy
## 2228 Why Are Republicans in Favor of Over-the-Counter Birth Control?
## 2267 If You Have Unlimited Vacation, Will You Take It?
## 2319 Close but Not Quite
## 2330 Fighting Human Nature
## 2438 The Cost of Being Cool
## 2512 Steven Salaita and the Quagmire of Academic Freedom
## 2542 Do You Have Time to Read This Story?
## 2603 Shamed, Flamed, Harassed: What Its Like To Be Called Fat Online
## 2725 Do We Get Less Narcissistic as We Get Older?
## 2756 Is Catalonia Spains Scotland?
## 2852 When Education Brings Depression
## 2873 Ugly Disagreements
## 2883 In the Face of Ebola, Stay Calm
## 3044 Joe Bidens Latest Gaffe: the Truth or an Ultimate Embarrassment?
## 3074 Inside the Bounds of a Hasidic Neighborhood
## 3168 Why Is Leon Panetta Throwing the President Under the Bus?
## 3231 Email Prompts In-Depth Look at Community Colleges
## 3235 Columbus Day, or Indigenous Peoples Day?
## 3263 Discussion of Pedophilia Turns Heated
## 3287 Women Fight ISIS and Sexism in Kurdish Regions
## 3368 The Slang Patrol
## 3414 Me and David Greenglass, a Man Whose Name Is Synonymous With Betrayal
## 3444 Is Obama a Great President or a Drag on the Democratic Ticket?
## 3472 Yes Means Yes: The Big Consent Debate
## 3492 Life and Death Through the Eyes of an Ebola Nurse
## 3521 Why Indias Muslims Havent Radicalized
## 3574 What Is a Nobel Prize Really Worth?
## 3635 Who Was Right About W.M.D.s in Iraq?
## 3809 How Brain Myths Could Hurt Kids
## 3851 On Its 20th Anniversary, Does Pulp Fiction Hold Up?
## 3882 To Die at Home: Reporters Notebook
## 3901 Tangled Passages
## 3908 Has Ebola Exposed a Strain of Racism?
## 4173 Should a Child Offender Be Treated as an Adult?
## 4206 Is a Sculpture That Resembles a Sex Toy a National Scandal?
## 4265 Steering the Climate Change Coverage
## 4281 What Jian Ghomeshis Accusers Were Afraid Of
## 4359 Taylor Swifts Unwelcome P.R. Campaign
## 4398 Can You Expect Comity or Conflict in a Republican-Controlled Senate?
## 4415 Phrases We Love Too Much
## 4465 If David Byrne Does Not Care About Contemporary Art, Should We?
## 4540 1983 | Having Claimed 558 Lives, AIDS Finally Made It to the Front Page
## 4712 Is It Voter Fraud or Voter Suppression in 2014?
## 4833 Why Do We Still Care About the Confederate Flag?
## 5038 Why Migraines Deserve More Attention
## 5058 Sexual Harassment at Yale: Delicate Subject, High-Impact Investigation
## 5276 The Benefits of Being Politically Correct
## 5285 Please. Don't 'Decry' the 'Divorcée.' Or Give Us Your 'CV.' The Times Guide to Modern Usage
## 5345 Should Your Child Play Football?
## 5387 Bright Passages
## 5649 How to Be French (When Asking for a Cigarette)
## 5674 How Did Obama Lose American Voters?
## 5881 Should You Pack Your Childs Lunch?
## 5884 Brooklyn, Planet Earth
## 5923 How OkCupid Has Become More Inclusive on Gender and Sexuality
## 6057 When Are You Not Working?
## 6159 Does the University of Virginia Have a Culture of Silence Around Sexual Assault?
## 6221 Brown Family's Lawyer Criticizes Process
## 6251 Ferguson and Other Cities React to Grand Jury Decision Not to Indict Darren Wilson
## 6252 Was Chuck Hagel a Failure or a Scapegoat?
## 6283 Can Brain Science Be Dangerous?
## 6286 A Quiet Wedding for Darren Wilson
## 6327 What Ferguson Says About the Fear of Social Media
## 6357 What Big Thing Would Reinvigorate the Democratic Party?
## 6370 Latest Updates: Protests Nationwide as More Troops Are Called to Ferguson
## 6395 What If Were Wrong About Depression?
## 6416 Another Word for Mr. President? Obamas Action on Immigration Draws Suggestions
## 6484 Does Your Job Make You Happy?
## 6497 What Jim Webb Will Bring to the Presidential Race in 2016
# Response/prediction columns for the false-negative OOB observations
# (FN_OOB_ids is defined earlier in the file).
print("FN_OOB_ids:")
## [1] "FN_OOB_ids:"
print(glb_OOBent_df[glb_OOBent_df$UniqueID %in% FN_OOB_ids,
grep(glb_rsp_var, names(glb_OOBent_df), value=TRUE)])
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 92 Y 0.004
## 693 Y 0.000
## 4020 Y 0.032
## 4721 Y 0.000
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 92 N
## 693 N
## 4020 N
## 4721 N
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 92 FALSE
## 693 FALSE
## 4020 FALSE
## 4721 FALSE
# Raw text columns (Headline/Snippet/Abstract, per glb_txt_vars) for the same
# false-negative observations.
print(glb_OOBent_df[glb_OOBent_df$UniqueID %in% FN_OOB_ids,
glb_txt_vars])
## Headline
## 92 Moelis & Co. Hires Cantor, Ex-House Majority Leader, as Vice Chairman
## 693 Do You Hire Employees on a Trial Basis?
## 4020 Video: News Conference About Ebola Patient at Bellevue Hospital
## 4721 Hong Kong Politician Likens Protesters to African-American Slaves
## Snippet
## 92 Eric Cantor, who suffered a surprising electoral defeat this year, will be joining Moelis & Company as vice chairman and a director on its board.
## 693 Do you think job candidates are willing to work for three months on a contract before being hired full-time?
## 4020 A news conference about Dr. Craig Spencer at Bellevue Hospital who tested positive for the Ebola virus.
## 4721 A prominent businesswoman and politician has come under fire for saying, erroneously, that black Americans did not get voting rights for 107 years after the countrys slaves were freed, so Hong Kongers should also wait.
## Abstract
## 92 Eric Cantor, who suffered a surprising electoral defeat this year, will be joining Moelis & Company as vice chairman and a director on its board.
## 693 Do you think job candidates are willing to work for three months on a contract before being hired full-time?
## 4020 A news conference about Dr. Craig Spencer at Bellevue Hospital who tested positive for the Ebola virus.
## 4721 A prominent businesswoman and politician has come under fire for saying, erroneously, that black Americans did not get voting rights for 107 years after the countrys slaves were freed, so Hong Kongers should also wait.
# Column sums of the numeric text-feature columns (names prefixed "H.", "S."
# or "A.") for the false-negative OOB observations, excluding character
# columns and ".fctr" factor columns.
# Fix: the original pattern "[HSA]." was unanchored and left the dot
# unescaped, so it matched any column merely containing H, S or A followed by
# any character (e.g. the S in "SectionName..."), relying on the setdiff
# filters to weed out the accidental matches.  Anchoring and escaping selects
# exactly the intended column families; all columns in the printed output
# already start with "H."/"S."/"A.", so the displayed result is unchanged.
print(dsp_vctr <- colSums(glb_OOBent_df[glb_OOBent_df$UniqueID %in% FN_OOB_ids,
    setdiff(grep("^[HSA]\\.", names(glb_OOBent_df), value=TRUE),
            union(myfind_chr_cols_df(glb_OOBent_df),
                  grep(".fctr", names(glb_OOBent_df), fixed=TRUE, value=TRUE)))]))
## H.X2014 H.X2015 H.daili
## 0.000000 0.000000 0.000000
## H.day H.fashion H.new
## 0.000000 0.000000 0.000000
## H.newyork H.report H.today
## 0.000000 0.000000 0.000000
## H.week H.has.http H.has.ebola
## 0.000000 0.000000 1.000000
## H.is.question H.num.chars H.num.words
## 1.000000 236.000000 26.000000
## H.num.words.unq H.num.chars.log H.num.words.log
## 26.000000 16.285913 7.965546
## H.num.words.unq.log S.articl S.can
## 7.965546 0.000000 0.000000
## S.compani S.day S.fashion
## 1.000000 0.000000 0.000000
## S.first S.intern S.make
## 0.000000 0.000000 0.000000
## S.new S.newyork S.one
## 0.000000 0.000000 0.000000
## S.presid S.report S.said
## 0.000000 0.000000 0.000000
## S.share S.show S.state
## 0.000000 0.000000 0.000000
## S.take S.time S.week
## 0.000000 0.000000 0.000000
## S.will S.year S.has.http
## 2.000000 2.000000 0.000000
## S.num.chars S.num.words S.num.words.unq
## 574.000000 56.000000 56.000000
## S.num.chars.log S.num.words.log S.num.words.unq.log
## 19.708417 10.659422 10.659422
## A.articl A.can A.compani
## 0.000000 0.000000 1.000000
## A.day A.fashion A.first
## 0.000000 0.000000 0.000000
## A.intern A.make A.new
## 0.000000 0.000000 0.000000
## A.newyork A.one A.presid
## 0.000000 0.000000 0.000000
## A.report A.said A.share
## 0.000000 0.000000 0.000000
## A.show A.state A.take
## 0.000000 0.000000 0.000000
## A.time A.week A.will
## 0.000000 0.000000 2.000000
## A.year A.has.http A.num.chars
## 2.000000 0.000000 574.000000
## A.num.words A.num.words.unq A.num.chars.log
## 56.000000 56.000000 19.708417
## A.num.words.log A.num.words.unq.log
## 10.659422 10.659422
# Display prediction details for one Headline.pfx group:
#   1. response/prediction columns for the matching OOB and new-data rows,
#   2. column sums of the numeric "H."/"S."/"A." text-feature columns for the
#      new-data rows (character and ".fctr" columns excluded),
#   3. the rows restricted to the non-zero features plus all character columns.
# Side effects only (printing); reads the globals glb_OOBent_df, glb_newent_df
# and glb_rsp_var, and the helper myfind_chr_cols_df.
# Fix: anchored and escaped the column pattern ("^[HSA]\\." instead of
# "[HSA].") — unanchored with an unescaped dot it matched any name containing
# H/S/A followed by any character, relying on the setdiff filters to remove
# the accidental matches.
dsp_hdlpfx_results <- function(hdlpfx) {
    print(hdlpfx)
    print(glb_OOBent_df[glb_OOBent_df$Headline.pfx %in% c(hdlpfx),
                        grep(glb_rsp_var, names(glb_OOBent_df), value=TRUE)])
    print(glb_newent_df[glb_newent_df$Headline.pfx %in% c(hdlpfx),
                        grep(glb_rsp_var, names(glb_newent_df), value=TRUE)])
    print(dsp_vctr <- colSums(glb_newent_df[glb_newent_df$Headline.pfx %in% c(hdlpfx),
        setdiff(grep("^[HSA]\\.", names(glb_newent_df), value=TRUE),
                union(myfind_chr_cols_df(glb_newent_df),
                      grep(".fctr", names(glb_newent_df), fixed=TRUE, value=TRUE)))]))
    # Keep only the features with a non-zero total before the final display.
    print(dsp_vctr <- dsp_vctr[dsp_vctr != 0])
    print(glb_newent_df[glb_newent_df$Headline.pfx %in% c(hdlpfx),
                        union(names(dsp_vctr), myfind_chr_cols_df(glb_newent_df))])
}
dsp_hdlpfx_results("Ask Well::")
## [1] "Ask Well::"
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 1053 Y 0.600
## 3228 Y 0.482
## 3437 Y 0.344
## 3602 N 0.556
## 4134 N 0.476
## 4217 N 0.468
## 4387 Y 0.382
## 5244 N 0.666
## 5658 Y 0.486
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 1053 Y
## 3228 Y
## 3437 N
## 3602 Y
## 4134 Y
## 4217 Y
## 4387 N
## 5244 Y
## 5658 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 1053 TRUE
## 3228 TRUE
## 3437 FALSE
## 3602 FALSE
## 4134 FALSE
## 4217 FALSE
## 4387 FALSE
## 5244 FALSE
## 5658 TRUE
## Popular.fctr Popular.fctr.predict.Final.rf.prob
## 6558 <NA> 0.374
## 7535 <NA> 0.534
## 7864 <NA> 0.570
## Popular.fctr.predict.Final.rf
## 6558 N
## 7535 Y
## 7864 Y
## H.X2014 H.X2015 H.daili
## 0.000000 0.000000 0.000000
## H.day H.fashion H.new
## 0.000000 0.000000 0.000000
## H.newyork H.report H.today
## 0.000000 0.000000 0.000000
## H.week H.has.http H.has.ebola
## 0.000000 0.000000 0.000000
## H.is.question H.num.chars H.num.words
## 0.000000 108.000000 17.000000
## H.num.words.unq H.num.chars.log H.num.words.log
## 16.000000 10.653204 5.634790
## H.num.words.unq.log S.articl S.can
## 5.480639 0.000000 0.000000
## S.compani S.day S.fashion
## 0.000000 0.000000 0.000000
## S.first S.intern S.make
## 0.000000 0.000000 1.000000
## S.new S.newyork S.one
## 0.000000 0.000000 0.000000
## S.presid S.report S.said
## 0.000000 0.000000 0.000000
## S.share S.show S.state
## 0.000000 0.000000 0.000000
## S.take S.time S.week
## 0.000000 0.000000 0.000000
## S.will S.year S.has.http
## 0.000000 0.000000 0.000000
## S.num.chars S.num.words S.num.words.unq
## 193.000000 22.000000 20.000000
## S.num.chars.log S.num.words.log S.num.words.unq.log
## 12.507177 6.340359 6.089045
## A.articl A.can A.compani
## 0.000000 0.000000 0.000000
## A.day A.fashion A.first
## 0.000000 0.000000 0.000000
## A.intern A.make A.new
## 0.000000 1.000000 0.000000
## A.newyork A.one A.presid
## 0.000000 0.000000 0.000000
## A.report A.said A.share
## 0.000000 0.000000 0.000000
## A.show A.state A.take
## 0.000000 0.000000 0.000000
## A.time A.week A.will
## 0.000000 0.000000 0.000000
## A.year A.has.http A.num.chars
## 0.000000 0.000000 193.000000
## A.num.words A.num.words.unq A.num.chars.log
## 22.000000 20.000000 12.507177
## A.num.words.log A.num.words.unq.log
## 6.340359 6.089045
## H.num.chars H.num.words H.num.words.unq
## 108.000000 17.000000 16.000000
## H.num.chars.log H.num.words.log H.num.words.unq.log
## 10.653204 5.634790 5.480639
## S.make S.num.chars S.num.words
## 1.000000 193.000000 22.000000
## S.num.words.unq S.num.chars.log S.num.words.log
## 20.000000 12.507177 6.340359
## S.num.words.unq.log A.make A.num.chars
## 6.089045 1.000000 193.000000
## A.num.words A.num.words.unq A.num.chars.log
## 22.000000 20.000000 12.507177
## A.num.words.log A.num.words.unq.log
## 6.340359 6.089045
## H.num.chars H.num.words H.num.words.unq H.num.chars.log
## 6558 51 7 7 3.951244
## 7535 21 4 4 3.091042
## 7864 36 6 5 3.610918
## H.num.words.log H.num.words.unq.log S.make S.num.chars S.num.words
## 6558 2.079442 2.079442 0 64 8
## 7535 1.609438 1.609438 1 53 6
## 7864 1.945910 1.791759 0 76 8
## S.num.words.unq S.num.chars.log S.num.words.log S.num.words.unq.log
## 6558 8 4.174387 2.197225 2.197225
## 7535 6 3.988984 1.945910 1.945910
## 7864 6 4.343805 2.197225 1.945910
## A.make A.num.chars A.num.words A.num.words.unq A.num.chars.log
## 6558 0 64 8 8 4.174387
## 7535 1 53 6 6 3.988984
## 7864 0 76 8 6 4.343805
## A.num.words.log A.num.words.unq.log NewsDesk SectionName
## 6558 2.197225 2.197225 Science Health
## 7535 1.945910 1.945910 Science Health
## 7864 2.197225 1.945910 Science Health
## SubsectionName Headline
## 6558 Ask Well: Eating Fat to Boost Vitamin D and Calcium
## 7535 Ask Well: Noisy Knees
## 7864 Ask Well: Wild Fish vs. Farmed Fish
## Snippet
## 6558 A reader asks: Must you eat fat to absorb calcium and vitamin D?
## 7535 A reader asks: Why do my knees make a cracking sound?
## 7864 A reader asks: Is eating farm-raised fish better than eating no fish at all?
## Abstract
## 6558 A reader asks: Must you eat fat to absorb calcium and vitamin D?
## 7535 A reader asks: Why do my knees make a cracking sound?
## 7864 A reader asks: Is eating farm-raised fish better than eating no fish at all?
## PubDate .src Headline.pfx NewsDesk.nb SectionName.nb
## 6558 2014-12-01 16:27:30 Test Ask Well:: Science Health
## 7535 2014-12-15 18:53:06 Test Ask Well:: Science Health
## 7864 2014-12-18 13:42:52 Test Ask Well:: Science Health
## SubsectionName.nb
## 6558 Science::Health
## 7535 Science::Health
## 7864 Science::Health
# Spot-check a single OOB observation (UniqueID 6446) flagged during error
# analysis of the myMisc::/OpEd group.
print("myMisc::|OpEd|blank|blank|1:")
## [1] "myMisc::|OpEd|blank|blank|1:"
print(glb_OOBent_df[glb_OOBent_df$UniqueID %in% c(6446),
grep(glb_rsp_var, names(glb_OOBent_df), value=TRUE)])
## Popular.fctr Popular.fctr.predict.Conditional.X.no.rnorm.rf.prob
## 6446 Y 0.736
## Popular.fctr.predict.Conditional.X.no.rnorm.rf
## 6446 Y
## Popular.fctr.predict.Conditional.X.no.rnorm.rf.accurate
## 6446 TRUE
# print(glb_OOBent_df[glb_OOBent_df$UniqueID %in% FN_OOB_ids,
# c("WordCount", "WordCount.log", "myMultimedia",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(mycreate_sqlxtab_df(glb_entity_df[sel_obs(Headline.contains="[Vv]ideo"), ],
# c(glb_rsp_var, "myMultimedia")))
# dsp_chisq.test(Headline.contains="[Vi]deo")
# print(glb_entity_df[sel_obs(Headline.contains="[Vv]ideo"),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline")])
# print(glb_entity_df[sel_obs(Headline.contains="[Ee]bola", Popular=1),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline",
# "NewsDesk", "SectionName", "SubsectionName")])
# List every feature with a non-NA importance, alongside its ConditionalX flag
# and all "*importance" columns.  (subset() uses non-standard evaluation:
# `importance` is resolved as a column of glb_feats_df.)
print(subset(glb_feats_df, !is.na(importance))[,
c("is.ConditionalX.y",
grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
## is.ConditionalX.y importance
## WordCount.log TRUE 100.00000000
## SubsectionName.nb.fctr TRUE 80.55962844
## PubDate.hour TRUE 50.40142882
## H.num.chars.log TRUE 31.50681999
## PubDate.minute TRUE 31.06935062
## SectionName.nb.fctr TRUE 30.96104916
## S.num.chars.log TRUE 29.83955854
## A.num.chars.log TRUE 27.42283005
## NewsDesk.nb.fctr TRUE 27.17840550
## PubDate.second TRUE 24.91688135
## H.num.words.log TRUE 12.00527830
## Headline.pfx.fctr TRUE 11.72217010
## S.num.words.unq.log TRUE 11.60835145
## H.num.words.unq.log TRUE 11.25894100
## A.num.words.unq.log TRUE 11.18544246
## S.num.words.log TRUE 10.97136866
## A.num.words.log TRUE 10.48234438
## H.is.question TRUE 7.54343001
## PubDate.wkday.fctr TRUE 5.06560307
## PubDate.apm.fctr TRUE 3.72443166
## PubDate.date.fctr TRUE 3.50248117
## S.time TRUE 2.28273783
## A.time TRUE 2.26463376
## A.one TRUE 2.23475110
## S.one TRUE 2.19710541
## A.new TRUE 2.12350133
## S.new TRUE 1.85916578
## A.year TRUE 1.73137113
## S.can TRUE 1.65012657
## S.year TRUE 1.62647972
## H.day TRUE 1.60400212
## A.can TRUE 1.50097929
## S.will TRUE 1.41958637
## A.report TRUE 1.38648928
## A.will TRUE 1.38263430
## A.state TRUE 1.29734430
## S.week TRUE 1.29638460
## S.report TRUE 1.25986121
## A.said TRUE 1.21943045
## S.said TRUE 1.20710524
## S.state TRUE 1.20583031
## A.compani TRUE 1.19678908
## S.newyork TRUE 1.15146558
## A.newyork TRUE 1.14950629
## A.week TRUE 1.13858903
## S.compani TRUE 1.12479942
## A.take TRUE 1.09050191
## S.take TRUE 1.00734157
## A.make TRUE 0.93790398
## H.has.ebola TRUE 0.88048418
## S.make TRUE 0.86990917
## S.share TRUE 0.83850954
## A.share TRUE 0.83670420
## S.presid TRUE 0.83648428
## S.show TRUE 0.79865434
## A.presid TRUE 0.79431490
## A.show TRUE 0.78546924
## S.day TRUE 0.74597854
## A.day TRUE 0.72495657
## H.new TRUE 0.68617761
## S.first TRUE 0.49663910
## A.first TRUE 0.44866721
## S.intern TRUE 0.44583700
## A.intern TRUE 0.42046341
## H.report TRUE 0.41504183
## H.today TRUE 0.34780584
## H.week TRUE 0.34413627
## H.X2014 TRUE 0.29048530
## H.newyork TRUE 0.28817488
## S.articl TRUE 0.19414432
## A.articl TRUE 0.19190230
## H.fashion TRUE 0.13979364
## A.fashion TRUE 0.09319559
## S.fashion TRUE 0.08458414
## Conditional.X.no.rnorm.rf.importance
## WordCount.log 100.00000000
## SubsectionName.nb.fctr 80.55962844
## PubDate.hour 50.40142882
## H.num.chars.log 31.50681999
## PubDate.minute 31.06935062
## SectionName.nb.fctr 30.96104916
## S.num.chars.log 29.83955854
## A.num.chars.log 27.42283005
## NewsDesk.nb.fctr 27.17840550
## PubDate.second 24.91688135
## H.num.words.log 12.00527830
## Headline.pfx.fctr 11.72217010
## S.num.words.unq.log 11.60835145
## H.num.words.unq.log 11.25894100
## A.num.words.unq.log 11.18544246
## S.num.words.log 10.97136866
## A.num.words.log 10.48234438
## H.is.question 7.54343001
## PubDate.wkday.fctr 5.06560307
## PubDate.apm.fctr 3.72443166
## PubDate.date.fctr 3.50248117
## S.time 2.28273783
## A.time 2.26463376
## A.one 2.23475110
## S.one 2.19710541
## A.new 2.12350133
## S.new 1.85916578
## A.year 1.73137113
## S.can 1.65012657
## S.year 1.62647972
## H.day 1.60400212
## A.can 1.50097929
## S.will 1.41958637
## A.report 1.38648928
## A.will 1.38263430
## A.state 1.29734430
## S.week 1.29638460
## S.report 1.25986121
## A.said 1.21943045
## S.said 1.20710524
## S.state 1.20583031
## A.compani 1.19678908
## S.newyork 1.15146558
## A.newyork 1.14950629
## A.week 1.13858903
## S.compani 1.12479942
## A.take 1.09050191
## S.take 1.00734157
## A.make 0.93790398
## H.has.ebola 0.88048418
## S.make 0.86990917
## S.share 0.83850954
## A.share 0.83670420
## S.presid 0.83648428
## S.show 0.79865434
## A.presid 0.79431490
## A.show 0.78546924
## S.day 0.74597854
## A.day 0.72495657
## H.new 0.68617761
## S.first 0.49663910
## A.first 0.44866721
## S.intern 0.44583700
## A.intern 0.42046341
## H.report 0.41504183
## H.today 0.34780584
## H.week 0.34413627
## H.X2014 0.29048530
## H.newyork 0.28817488
## S.articl 0.19414432
## A.articl 0.19190230
## H.fashion 0.13979364
## A.fashion 0.09319559
## S.fashion 0.08458414
## Final.rf.importance
## WordCount.log 100.00000000
## SubsectionName.nb.fctr 80.55962844
## PubDate.hour 50.40142882
## H.num.chars.log 31.50681999
## PubDate.minute 31.06935062
## SectionName.nb.fctr 30.96104916
## S.num.chars.log 29.83955854
## A.num.chars.log 27.42283005
## NewsDesk.nb.fctr 27.17840550
## PubDate.second 24.91688135
## H.num.words.log 12.00527830
## Headline.pfx.fctr 11.72217010
## S.num.words.unq.log 11.60835145
## H.num.words.unq.log 11.25894100
## A.num.words.unq.log 11.18544246
## S.num.words.log 10.97136866
## A.num.words.log 10.48234438
## H.is.question 7.54343001
## PubDate.wkday.fctr 5.06560307
## PubDate.apm.fctr 3.72443166
## PubDate.date.fctr 3.50248117
## S.time 2.28273783
## A.time 2.26463376
## A.one 2.23475110
## S.one 2.19710541
## A.new 2.12350133
## S.new 1.85916578
## A.year 1.73137113
## S.can 1.65012657
## S.year 1.62647972
## H.day 1.60400212
## A.can 1.50097929
## S.will 1.41958637
## A.report 1.38648928
## A.will 1.38263430
## A.state 1.29734430
## S.week 1.29638460
## S.report 1.25986121
## A.said 1.21943045
## S.said 1.20710524
## S.state 1.20583031
## A.compani 1.19678908
## S.newyork 1.15146558
## A.newyork 1.14950629
## A.week 1.13858903
## S.compani 1.12479942
## A.take 1.09050191
## S.take 1.00734157
## A.make 0.93790398
## H.has.ebola 0.88048418
## S.make 0.86990917
## S.share 0.83850954
## A.share 0.83670420
## S.presid 0.83648428
## S.show 0.79865434
## A.presid 0.79431490
## A.show 0.78546924
## S.day 0.74597854
## A.day 0.72495657
## H.new 0.68617761
## S.first 0.49663910
## A.first 0.44866721
## S.intern 0.44583700
## A.intern 0.42046341
## H.report 0.41504183
## H.today 0.34780584
## H.week 0.34413627
## H.X2014 0.29048530
## H.newyork 0.28817488
## S.articl 0.19414432
## A.articl 0.19190230
## H.fashion 0.13979364
## A.fashion 0.09319559
## S.fashion 0.08458414
# List the Conditional.X-eligible features that never received an importance
# score from any fitted model (e.g. the synthetic .rnorm predictor); shows the
# indicator column alongside every "*importance*" column of glb_feats_df
print(subset(glb_feats_df,
             is.ConditionalX.y & is.na(importance),
             select=c("is.ConditionalX.y",
                      grep("importance", names(glb_feats_df), fixed=TRUE,
                           value=TRUE))))
## is.ConditionalX.y importance Conditional.X.no.rnorm.rf.importance
## .rnorm TRUE NA NA
## Final.rf.importance
## .rnorm NA
# Snapshot the combined entity data frame before back-merging per-split columns
sav_entity_df <- glb_entity_df
# Columns present only in the training split (expected: character(0), i.e. none)
print(setdiff(names(glb_trnent_df), names(glb_entity_df)))
## character(0)
for (col in setdiff(names(glb_trnent_df), names(glb_entity_df)))
# Merge or cbind ?
    # NOTE(review): this copies by position, assuming glb_trnent_df rows are in
    # the same order as the .src == "Train" rows of glb_entity_df -- confirm
    # that upstream splitting preserves row order before relying on this
glb_entity_df[glb_entity_df$.src == "Train", col] <- glb_trnent_df[, col]
# Columns present only in the fit split (expected: character(0), i.e. none)
print(setdiff(names(glb_fitent_df), names(glb_entity_df)))
## character(0)
# Columns present only in the out-of-bag (OOB) split
print(setdiff(names(glb_OOBent_df), names(glb_entity_df)))
## character(0)
for (col in setdiff(names(glb_OOBent_df), names(glb_entity_df)))
# Merge or cbind ?
    # NOTE(review): positional copy -- assumes glb_OOBent_df rows align with the
    # .lcn == "OOB" rows of glb_entity_df; verify upstream ordering
glb_entity_df[glb_entity_df$.lcn == "OOB", col] <- glb_OOBent_df[, col]
# Columns present only in the new/test data split (expected: none)
print(setdiff(names(glb_newent_df), names(glb_entity_df)))
## character(0)
# Persist the feature table, merged entity data, model summaries and the
# selected/final model objects to <glb_out_pfx>prdnew_dsk.RData so the
# prediction session can be reloaded without refitting
save(glb_feats_df, glb_entity_df,
#glb_trnent_df, glb_fitent_df, glb_OOBent_df, glb_newent_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
glb_sel_mdl, glb_sel_mdl_id,
glb_fin_mdl, glb_fin_mdl_id,
file=paste0(glb_out_pfx, "prdnew_dsk.RData"))
# Disabled: replay the analytics petri-net simulation to visualize that the
# "data.new.prediction" state has been reached (see mypetrinet.R helpers)
# tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
# replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
# "data.new.prediction")), flip_coord=TRUE)
# print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
glb_chunks_df <- myadd_chunk(glb_chunks_df, "display.session.info", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 15 predict.data.new 8 0 1000.291 1084.539 84.248
## 16 display.session.info 9 0 1084.540 NA NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance of mpg appears to differ between the am_fctr groups, so Welch's unequal-variance t-test is used (var.equal=FALSE). #{r q1, cache=FALSE} # print(t.test(subset(cars_df, am_fctr == "automatic")$mpg, # subset(cars_df, am_fctr == "manual")$mpg, # var.equal=FALSE)$conf) # We reject the null hypothesis i.e. we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better for miles per gallon versus automatic transmission.
## label step_major step_minor bgn end elapsed
## 10 fit.models 6 1 214.381 565.526 351.145
## 13 fit.data.training 7 0 648.458 911.375 262.917
## 6 extract.features 3 0 62.116 154.693 92.578
## 14 fit.data.training 7 1 911.375 1000.290 88.915
## 15 predict.data.new 8 0 1000.291 1084.539 84.248
## 12 fit.models 6 3 580.904 648.457 67.553
## 9 fit.models 6 0 166.467 214.381 47.914
## 3 cleanse.data 2 1 28.451 56.654 28.203
## 2 inspect.data 2 0 10.394 28.450 18.056
## 11 fit.models 6 2 565.526 580.904 15.378
## 7 select.features 4 0 154.694 165.031 10.337
## 4 manage.missing.data 2 2 56.654 62.058 5.404
## 8 partition.data.training 5 0 165.032 166.466 1.434
## 1 import.data 1 0 9.123 10.393 1.270
## 5 encode.data 2 3 62.059 62.115 0.057
## duration
## 10 351.145
## 13 262.917
## 6 92.577
## 14 88.915
## 15 84.248
## 12 67.553
## 9 47.914
## 3 28.203
## 2 18.056
## 11 15.378
## 7 10.337
## 4 5.404
## 8 1.434
## 1 1.270
## 5 0.056
## [1] "Total Elapsed Time: 1,084.539 secs"
## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] tcltk grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] randomForest_4.6-10 rpart.plot_1.5.2 rpart_4.1-9
## [4] ROCR_1.0-7 gplots_2.16.0 caTools_1.17.1
## [7] caret_6.0-41 tm_0.6 NLP_0.1-6
## [10] mice_2.22 lattice_0.20-31 Rcpp_0.11.5
## [13] plyr_1.8.1 sqldf_0.4-10 RSQLite_1.0.0
## [16] DBI_0.3.1 gsubfn_0.6-6 proto_0.3-10
## [19] reshape2_1.4.1 doBy_4.5-13 survival_2.38-1
## [22] ggplot2_1.0.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 BradleyTerry2_1.0-6 brglm_0.5-9
## [4] car_2.0-25 chron_2.3-45 class_7.3-12
## [7] codetools_0.2-11 colorspace_1.2-6 compiler_3.1.3
## [10] digest_0.6.8 e1071_1.6-4 evaluate_0.5.5
## [13] foreach_1.4.2 formatR_1.1 gdata_2.13.3
## [16] gtable_0.1.2 gtools_3.4.1 htmltools_0.2.6
## [19] iterators_1.0.7 KernSmooth_2.23-14 knitr_1.9
## [22] labeling_0.3 lme4_1.1-7 MASS_7.3-40
## [25] Matrix_1.2-0 mgcv_1.8-6 minqa_1.2.4
## [28] munsell_0.4.2 nlme_3.1-120 nloptr_1.0.4
## [31] nnet_7.3-9 parallel_3.1.3 pbkrtest_0.4-2
## [34] quantreg_5.11 RColorBrewer_1.1-2 rmarkdown_0.5.1
## [37] scales_0.2.4 slam_0.1-32 SparseM_1.6
## [40] splines_3.1.3 stringr_0.6.2 tools_3.1.3
## [43] yaml_2.1.13